From 0150d071ffc76d82edeaf732c2db82cbeef52177 Mon Sep 17 00:00:00 2001 From: "mafetter@fleming.research" Date: Mon, 14 Mar 2005 22:07:47 +0000 Subject: [PATCH] bitkeeper revision 1.1236.32.2 (42360b33-HudAOddVBt3ez4shMiyOw) Initial fullshadow checkin. Things still to do: - reuse snapshots intelligently. - minimize tlb flushes during resync. - figure out when to free up no-longer-used L2 shadows, and generally deal with out-of-memory kinds of problems. Some basic guidelines: - With fullshadow on, you cannot trust linear_pg_table unless you have first checked whether the VA in which you are interested is out-of-sync or not. - Significant new functions/macros include: page_out_of_sync(mfn): returns true if page is out of sync. shadow_mark_out_of_sync: make a page be out of sync (allocating any necessary snapshots, etc) shadow_out_of_sync(va): returns true if the current mappings involved in va are out-of-sync. shadow_sync_va(): bring the pages involved in mapping a particular va back into sync. Currently calls shadow_sync_all(). shadow_sync_all(): bring all pages back in-sync. 
Signed-off-by: michael.fetterman@cl.cam.ac.uk --- .rootkeys | 1 + xen/arch/x86/audit.c | 817 +++++++++++ xen/arch/x86/domain.c | 71 +- xen/arch/x86/mm.c | 1031 +++++--------- xen/arch/x86/shadow.c | 2083 ++++++++++++++++++++++------ xen/arch/x86/traps.c | 11 +- xen/arch/x86/vmx.c | 27 +- xen/arch/x86/x86_32/domain_build.c | 14 +- xen/arch/x86/x86_32/domain_page.c | 2 + xen/common/dom_mem_ops.c | 22 + xen/common/keyhandler.c | 2 +- xen/common/page_alloc.c | 19 +- xen/common/schedule.c | 3 + xen/include/asm-x86/domain.h | 22 +- xen/include/asm-x86/mm.h | 69 +- xen/include/asm-x86/page.h | 6 +- xen/include/asm-x86/shadow.h | 877 ++++++++---- xen/include/asm-x86/x86_32/page.h | 2 +- xen/include/xen/domain.h | 2 - xen/include/xen/perfc_defn.h | 52 +- 20 files changed, 3587 insertions(+), 1546 deletions(-) create mode 100644 xen/arch/x86/audit.c diff --git a/.rootkeys b/.rootkeys index 996aac70b0..6148c1a213 100644 --- a/.rootkeys +++ b/.rootkeys @@ -939,6 +939,7 @@ 3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/x86/Rules.mk 3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/x86/acpi.c 3ddb79bcsjinG9k1KcvbVBuas1R2dA xen/arch/x86/apic.c +42360b3244-Q6BpEKhR_A1YtG1wPNQ xen/arch/x86/audit.c 3ddb79c4yGZ7_22QAFFwPzqP4NSHwA xen/arch/x86/boot/mkelf32.c 3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/x86/boot/x86_32.S 40e42bdbNu4MjI750THP_8J1S-Sa0g xen/arch/x86/boot/x86_64.S diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c new file mode 100644 index 0000000000..1c5b89fa2f --- /dev/null +++ b/xen/arch/x86/audit.c @@ -0,0 +1,817 @@ +/****************************************************************************** + * arch/x86/audit.c + * + * Copyright (c) 2002-2005 K A Fraser + * Copyright (c) 2004 Christian Limpach + * Copyright (c) 2005 Michael A Fetterman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any 
later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +//#include +//#include +#include +//#include +//#include +#include +#include +#include +//#include +//#include +//#include +//#include + +// XXX SMP bug -- these should not be statics... +// +static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; +static int l1, l2, oos_count, page_count; + +#define FILE_AND_LINE 1 + +#if FILE_AND_LINE +#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__) +#define ADJUST_EXTRA_ARGS ,const char *file, int line +#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line) +#else +#define adjust _adjust +#define ADJUST_EXTRA_ARGS +#define APRINTK(_f, _a...) printk(_f "\n", ##_a) +#endif + +int audit_adjust_pgtables(struct domain *d, int dir, int noisy) +{ + int errors = 0; + int shadow_enabled = shadow_mode_enabled(d) ? 
1 : 0; + + void _adjust(struct pfn_info *page, int adjtype ADJUST_EXTRA_ARGS) + { + if ( adjtype ) + { + // adjust the type count + // + int tcount = page->u.inuse.type_info & PGT_count_mask; + tcount += dir; + ttot++; + + if ( page_get_owner(page) == NULL ) + { + APRINTK("adjust(mfn=%p, dir=%d, adjtype=%d) owner=NULL", + page_to_pfn(page), dir, adjtype, file, line); + errors++; + } + + if ( tcount < 0 ) + { + APRINTK("Audit %d: type count went below zero mfn=%x t=%x ot=%x", + d->id, page-frame_table, + page->u.inuse.type_info, + page->tlbflush_timestamp); + errors++; + } + else if ( (tcount & ~PGT_count_mask) != 0 ) + { + APRINTK("Audit %d: type count overflowed mfn=%x t=%x ot=%x", + d->id, page-frame_table, + page->u.inuse.type_info, + page->tlbflush_timestamp); + errors++; + } + else + page->u.inuse.type_info += dir; + } + + // adjust the general count + // + int count = page->count_info & PGC_count_mask; + count += dir; + ctot++; + + if ( count < 0 ) + { + APRINTK("Audit %d: general count went below zero pfn=%x t=%x ot=%x", + d->id, page-frame_table, + page->u.inuse.type_info, + page->tlbflush_timestamp); + errors++; + } + else if ( (count & ~PGT_count_mask) != 0 ) + { + APRINTK("Audit %d: general count overflowed pfn=%x t=%x ot=%x", + d->id, page-frame_table, + page->u.inuse.type_info, + page->tlbflush_timestamp); + errors++; + } + else + page->count_info += dir; + } + + void adjust_l2_page(unsigned long mfn, int adjtype) + { + unsigned long *pt = map_domain_mem(mfn << PAGE_SHIFT); + int i, limit; + + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + + for ( i = 0; i < limit; i++ ) + { + if ( pt[i] & _PAGE_PRESENT ) + { + unsigned long l1mfn = pt[i] >> PAGE_SHIFT; + struct pfn_info *l1page = pfn_to_page(l1mfn); + + if ( noisy ) + { + if ( shadow_enabled ) + { + if ( page_get_owner(l1page) != NULL ) + { + printk("L2: Bizarre shadow L1 page mfn=%p " + "belonging to a domain %p (id=%d)\n", + l1mfn, 
+ page_get_owner(l1page), + page_get_owner(l1page)->id); + errors++; + continue; + } + } + else + { + if ( page_get_owner(l1page) != d ) + { + printk("L2: Skip bizarre L1 page mfn=%p " + "belonging to other dom %p (id=%d)\n", + l1mfn, + page_get_owner(l1page), + page_get_owner(l1page)->id); + errors++; + continue; + } + + u32 page_type = l1page->u.inuse.type_info & PGT_type_mask; + + if ( page_type == PGT_l2_page_table ) + { + printk("Audit %d: [%x] Found %s Linear PT " + "t=%x mfn=%p\n", + d->id, i, (l1mfn==mfn) ? "Self" : "Other", + l1page->u.inuse.type_info, l1mfn); + } + else if ( page_type != PGT_l1_page_table ) + { + printk("Audit %d: [L2 mfn=%p i=%x] " + "Expected L1 t=%x mfn=%p\n", + d->id, mfn, i, + l1page->u.inuse.type_info, l1mfn); + errors++; + } + } + } + + adjust(l1page, adjtype); + } + } + + unmap_domain_mem(pt); + } + + void adjust_l1_page(unsigned long l1mfn) + { + unsigned long *pt = map_domain_mem(l1mfn << PAGE_SHIFT); + int i; + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + if ( pt[i] & _PAGE_PRESENT ) + { + unsigned long gmfn = pt[i] >> PAGE_SHIFT; + struct pfn_info *gpage = pfn_to_page(gmfn); + + if ( gmfn < 0x100 ) + { + lowmem_mappings++; + continue; + } + + if ( gmfn > max_page ) + { + io_mappings++; + continue; + } + + if ( noisy ) + { + if ( pt[i] & _PAGE_RW ) + { + // If it's not a writable page, complain. + // + if ( !((gpage->u.inuse.type_info & PGT_type_mask) == + PGT_writable_page) ) + { + printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW " + "t=%x mfn=%p\n", + d->id, l1mfn, i, + gpage->u.inuse.type_info, gmfn); + errors++; + } + + if ( shadow_enabled && + page_is_page_table(gpage) && + ! 
page_out_of_sync(gpage) ) + { + printk("Audit %d: [l1mfn=%p, i=%x] Illegal RW of " + "page table gmfn=%p\n", + d->id, l1mfn, i, gmfn); + errors++; + } + } + + if ( page_get_owner(gpage) != d ) + { + printk("Audit %d: [l1mfn=%p,i=%x] Skip foreign page " + "dom=%p (id=%d) mfn=%p c=%08x t=%08x\n", + d->id, l1mfn, i, + page_get_owner(gpage), + page_get_owner(gpage)->id, + gmfn, + gpage->count_info, + gpage->u.inuse.type_info); + continue; + } + } + + adjust(gpage, (pt[i] & _PAGE_RW) ? 1 : 0); + } + } + + unmap_domain_mem(pt); + } + + void adjust_shadow_tables() + { + struct shadow_status *a; + unsigned long smfn, gmfn; + struct pfn_info *page; + int i; + + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask); + smfn = a->smfn; + page = &frame_table[smfn]; + + adjust(pfn_to_page(gmfn), 0); + + switch ( a->gpfn_and_flags & PGT_type_mask ) { + case PGT_snapshot: + break; + case PGT_l1_shadow: + case PGT_hl2_shadow: + adjust_l1_page(smfn); + if ( page->u.inuse.type_info & PGT_pinned ) + adjust(page, 0); + break; + case PGT_l2_shadow: + adjust_l2_page(smfn, 0); + if ( page->u.inuse.type_info & PGT_pinned ) + adjust(page, 0); + break; + default: + BUG(); + break; + } + + a = a->next; + } + } + } + + void adjust_oos_list() + { + struct out_of_sync_entry *oos; + + if ( (oos = d->arch.out_of_sync) ) + ASSERT(shadow_enabled); + + while ( oos ) + { + adjust(pfn_to_page(oos->gmfn), 0); + + // Only use entries that have low bits clear... 
+ // + if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) + adjust(pfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0); + + oos = oos->next; + oos_count++; + } + } + + void adjust_for_pgtbase() + { + struct exec_domain *ed; + + for_each_exec_domain(d, ed) + { + if ( !shadow_enabled ) + { + if ( pagetable_val(ed->arch.guest_table) ) + adjust(&frame_table[pagetable_val(ed->arch.guest_table) + >> PAGE_SHIFT], 1); + } + else + { + if ( pagetable_val(ed->arch.guest_table) ) + adjust(&frame_table[pagetable_val(ed->arch.guest_table) + >> PAGE_SHIFT], 0); + if ( pagetable_val(ed->arch.shadow_table) ) + adjust(&frame_table[pagetable_val(ed->arch.shadow_table) + >> PAGE_SHIFT], 0); + } + } + } + + void adjust_guest_pages() + { + struct list_head *list_ent = d->page_list.next; + struct pfn_info *page; + unsigned long mfn; + + while ( list_ent != &d->page_list ) + { + u32 page_type; + + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + page_type = page->u.inuse.type_info & PGT_type_mask; + + if ( page_get_owner(page) != d ) + BUG(); + + page_count++; + + switch ( page_type ) + { + case PGT_l2_page_table: + l2++; + + if ( noisy ) + { + if ( shadow_enabled ) + { + printk("Audit %d: found an L2 guest page " + "mfn=%p t=%08x c=%08x while in shadow mode\n", + mfn, page->u.inuse.type_info, page->count_info); + errors++; + } + + if ( (page->u.inuse.type_info & PGT_validated) != + PGT_validated ) + { + printk("Audit %d: L2 mfn=%p not validated %p\n", + d->id, mfn, page->u.inuse.type_info); + errors++; + } + + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) + { + printk("Audit %d: L2 mfn=%p not pinned t=%p\n", + d->id, mfn, page->u.inuse.type_info); + errors++; + } + } + + if ( page->u.inuse.type_info & PGT_pinned ) + adjust(page, 1); + + if ( page->u.inuse.type_info & PGT_validated ) + adjust_l2_page(mfn, 1); + + break; + + case PGT_l1_page_table: + l1++; + + if ( noisy ) + { + if ( shadow_enabled ) + { + printk("found an L1 guest page 
mfn=%p t=%08x c=%08x while in shadow mode\n", + mfn, page->u.inuse.type_info, page->count_info); + errors++; + } + + if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) + { + printk("Audit %d: L1 not validated mfn=%p t=%p\n", + d->id, mfn, page->u.inuse.type_info); + errors++; + } + + if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) + { + if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ) + { + printk("Audit %d: L1 mfn=%p not pinned t=%p\n", + d->id, mfn, page->u.inuse.type_info); + errors++; + } + } + } + + if ( page->u.inuse.type_info & PGT_pinned ) + adjust(page, 1); + + if ( page->u.inuse.type_info & PGT_validated ) + adjust_l1_page(mfn); + + break; + + case PGT_gdt_page: + ASSERT( !page_out_of_sync(page) ); + adjust(page, 1); + break; + + case PGT_ldt_page: + ASSERT( !page_out_of_sync(page) ); + adjust(page, 1); + break; + + case PGT_writable_page: + if ( shadow_enabled ) + { + // In shadow mode, writable pages can get pinned by + // paravirtualized guests that think they are pinning + // their L1s and/or L2s. 
+ // + if ( page->u.inuse.type_info & PGT_pinned ) + adjust(page, 1); + } + } + + list_ent = page->list.next; + } + } + + adjust_for_pgtbase(); + + adjust_guest_pages(); + + if ( shadow_enabled ) + { + adjust_oos_list(); + adjust_shadow_tables(); + } + + return errors; +} + + +#ifndef NDEBUG + +void _audit_domain(struct domain *d, int flags, const char *file, int line) +{ + void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn, + unsigned long mfn) + { + struct pfn_info *page = &frame_table[mfn]; + unsigned long *pt = map_domain_mem(mfn); + int i; + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + if ( (pt[i] & _PAGE_PRESENT) && ((pt[i] >> PAGE_SHIFT) == xmfn) ) + printk(" found dom=%d mfn=%p t=%x c=%x pt[i=%x]=%p\n", + d->id, mfn, page->u.inuse.type_info, + page->count_info, i, pt[i]); + } + + unmap_domain_mem(pt); + } + + void scan_for_pfn(struct domain *d, unsigned long xmfn) + { + if ( !shadow_mode_enabled(d) ) + { + struct list_head *list_ent = d->page_list.next; + struct pfn_info *page; + + while ( list_ent != &d->page_list ) + { + page = list_entry(list_ent, struct pfn_info, list); + + switch ( page->u.inuse.type_info & PGT_type_mask ) + { + case PGT_l1_page_table: + case PGT_l2_page_table: + scan_for_pfn_in_mfn(d, xmfn, page_to_pfn(page)); + break; + default: + break; + } + + list_ent = page->list.next; + } + } + else + { + struct shadow_status *a; + int i; + + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + switch ( a->gpfn_and_flags & PGT_type_mask ) + { + case PGT_l1_shadow: + case PGT_l2_shadow: + case PGT_hl2_shadow: + scan_for_pfn_in_mfn(d, xmfn, a->smfn); + break; + case PGT_snapshot: + break; + default: + BUG(); + break; + } + a = a->next; + } + } + } + } + + void scan_for_pfn_remote(unsigned long xmfn) + { + struct domain *e; + for_each_domain ( e ) + scan_for_pfn( e, xmfn ); + } + + unsigned long mfn; + struct list_head *list_ent; + struct pfn_info *page; + int errors 
= 0; + + if ( d != current->domain ) + domain_pause(d); + synchronise_pagetables(~0UL); + + // Maybe we should just be using BIGLOCK? + // + if ( !(flags & AUDIT_ALREADY_LOCKED) ) + shadow_lock(d); + + spin_lock(&d->page_alloc_lock); + + /* PHASE 0 */ + + list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + u32 page_type; + + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + page_type = page->u.inuse.type_info & PGT_type_mask; + + if ( page_get_owner(page) != d ) + BUG(); + + if ( (page->u.inuse.type_info & PGT_count_mask) > + (page->count_info & PGC_count_mask) ) + { + printk("taf(%08x) > caf(%08x) mfn=%p\n", + page->u.inuse.type_info, page->count_info, mfn); + errors++; + } + + if ( shadow_mode_enabled(d) && + (page_type == PGT_writable_page) && + !(page->u.inuse.type_info & PGT_validated) ) + { + printk("shadow mode writable page not validated mfn=%p t=%08x c=%08x\n", + mfn, page->u.inuse.type_info, page->count_info); + errors++; + } + +#if 0 /* SYSV shared memory pages plus writeable files. 
*/ + if ( page_type == PGT_writable_page && + (page->u.inuse.type_info & PGT_count_mask) > 1 ) + { + printk("writeable page with type count >1: mfn=%lx t=%x c=%x\n", + mfn, + page->u.inuse.type_info, + page->count_info ); + errors++; + scan_for_pfn_remote(mfn); + } +#endif + + if ( page_type == PGT_none && + (page->u.inuse.type_info & PGT_count_mask) > 0 ) + { + printk("normal page with type count >0: mfn=%lx t=%x c=%x\n", + mfn, + page->u.inuse.type_info, + page->count_info ); + errors++; + } + + if ( page_out_of_sync(page) ) + { + if ( !page_is_page_table(page) ) + { + printk("out of sync page mfn=%p is not a page table\n", mfn); + errors++; + } + unsigned long pfn = __mfn_to_gpfn(d, mfn); + if ( !__shadow_status(d, pfn, PGT_snapshot) ) + { + printk("out of sync page mfn=%p doesn't have a snapshot\n"); + errors++; + } + if ( page_type != PGT_writable_page ) + { + printk("out of sync page mfn=%p has strange type t=%08x c=%08x\n", + mfn, page->u.inuse.type_info, page->count_info); + errors++; + } + } + + /* Use tlbflush_timestamp to store original type_info. 
*/ + page->tlbflush_timestamp = page->u.inuse.type_info; + + list_ent = page->list.next; + } + + /* PHASE 1 */ + io_mappings = lowmem_mappings = 0; + + errors += audit_adjust_pgtables(d, -1, 1); + + if ( !(flags & AUDIT_QUIET) && + ((io_mappings > 0) || (lowmem_mappings > 0)) ) + printk("Audit %d: Found %d lowmem mappings and %d io mappings\n", + d->id, lowmem_mappings, io_mappings); + + /* PHASE 2 */ + + list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + + switch ( page->u.inuse.type_info & PGT_type_mask) + { + case PGT_l1_page_table: + case PGT_l2_page_table: + if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) + { + printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n", + d->id, page->u.inuse.type_info, + page->tlbflush_timestamp, + page->count_info, mfn); + errors++; + scan_for_pfn_remote(mfn); + } + break; + case PGT_none: + case PGT_writable_page: + case PGT_gdt_page: + case PGT_ldt_page: + if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) + { + printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n", + d->id, page->u.inuse.type_info, + page->tlbflush_timestamp, + page->count_info, mfn); + errors++; + } + break; + default: + BUG(); // XXX fix me... + } + + if ( (page->count_info & PGC_count_mask) != 1 ) + { + printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x mfn=%lx\n", + d->id, + page->count_info, + page->u.inuse.type_info, + page->tlbflush_timestamp, mfn ); + errors++; + scan_for_pfn_remote(mfn); + } + + list_ent = page->list.next; + } + + if ( shadow_mode_enabled(d) ) + { + struct shadow_status *a; + struct pfn_info *page; + u32 page_type; + int i; + + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + page = pfn_to_page(a->smfn); + page_type = a->gpfn_and_flags & PGT_type_mask; + + switch ( page_type ) { + case PGT_snapshot: + // XXX -- what should we check here? 
+ break; + case PGT_l1_shadow: + case PGT_l2_shadow: + if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) || + (page->count_info != 0) ) + { + printk("Audit %d: shadow page counts wrong mfn=%p t=%x c=%x\n", + d->id, page_to_pfn(page), + page->u.inuse.type_info, + page->count_info); + errors++; + } + break; + + case PGT_hl2_shadow: // haven't thought about this case yet. + default: + BUG(); + break; + } + + a = a->next; + } + } + } + + /* PHASE 3 */ + ctot = ttot = page_count = l1 = l2 = oos_count = 0; + + audit_adjust_pgtables(d, 1, 0); + +#if 0 + // This covers our sins of trashing the tlbflush_timestamps... + // + local_flush_tlb(); +#endif + + spin_unlock(&d->page_alloc_lock); + + if ( !(flags & AUDIT_QUIET) ) + printk("Audit dom%d (%s:%d) Done. " + "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n", + d->id, file, line, page_count, oos_count, l1, l2, ctot, ttot ); + + if ( !(flags & AUDIT_ALREADY_LOCKED) ) + shadow_unlock(d); + + if ( d != current->domain ) + domain_unpause(d); + + if ( errors && !(flags & AUDIT_ERRORS_OK) ) + BUG(); +} + +void audit_domains(void) +{ + struct domain *d; + for_each_domain ( d ) + audit_domain(d); +} + +void audit_domains_key(unsigned char key) +{ + audit_domains(); +} +#endif diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index f2b46e8c07..029d5fd5a4 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -247,10 +247,9 @@ void arch_do_createdomain(struct exec_domain *ed) machine_to_phys_mapping[virt_to_phys(d->arch.mm_perdomain_pt) >> PAGE_SHIFT] = INVALID_M2P_ENTRY; ed->arch.perdomain_ptes = d->arch.mm_perdomain_pt; -#if 0 /* don't need this yet, but maybe soon! 
*/ - ed->arch.guest_vtable = linear_l2_table; - ed->arch.shadow_vtable = shadow_linear_l2_table; -#endif + + ed->arch.guest_vtable = __linear_l2_table; + ed->arch.shadow_vtable = __shadow_linear_l2_table; #ifdef __x86_64__ d->arch.mm_perdomain_l2 = (l2_pgentry_t *)alloc_xenheap_page(); @@ -295,70 +294,6 @@ void arch_vmx_do_launch(struct exec_domain *ed) reset_stack_and_jump(vmx_asm_do_launch); } -unsigned long alloc_monitor_pagetable(struct exec_domain *ed) -{ - unsigned long mmfn; - l2_pgentry_t *mpl2e; - struct pfn_info *mmfn_info; - struct domain *d = ed->domain; - - ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */ - - mmfn_info = alloc_domheap_page(NULL); - ASSERT( mmfn_info ); - - mmfn = (unsigned long) (mmfn_info - frame_table); - mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT); - memset(mpl2e, 0, PAGE_SIZE); - - memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - - mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = - mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) - | __PAGE_HYPERVISOR); - - ed->arch.monitor_vtable = mpl2e; - - // map the phys_to_machine map into the Read-Only MPT space for this domain - mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = - mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR); - - return mmfn; -} - -/* - * Free the pages for monitor_table and hl2_table - */ -static void free_monitor_pagetable(struct exec_domain *ed) -{ - l2_pgentry_t *mpl2e; - unsigned long mfn; - - ASSERT( pagetable_val(ed->arch.monitor_table) ); - - mpl2e = ed->arch.monitor_vtable; - - /* - * First get the mfn for hl2_table by looking at monitor_table - */ - mfn = l2_pgentry_val(mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) - >> PAGE_SHIFT; - - free_domheap_page(&frame_table[mfn]); - unmap_domain_mem(mpl2e); - - /* - * Then free monitor_table. 
- */ - mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT; - free_domheap_page(&frame_table[mfn]); - - ed->arch.monitor_table = mk_pagetable(0); - ed->arch.monitor_vtable = 0; -} - static int vmx_final_setup_guest(struct exec_domain *ed, full_execution_context_t *full_context) { diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 4a6254bac5..0a7be0e313 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -104,19 +104,12 @@ #ifdef VERBOSE #define MEM_LOG(_f, _a...) \ - printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \ + printk("DOM%u: MEM_LOG(line=%d) " _f "\n", \ current->domain->id , __LINE__ , ## _a ) #else #define MEM_LOG(_f, _a...) ((void)0) #endif -static int alloc_l2_table(struct pfn_info *page); -static int alloc_l1_table(struct pfn_info *page); -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d); -static int get_page_and_type_from_pagenr(unsigned long page_nr, - u32 type, - struct domain *d); - static void free_l2_table(struct pfn_info *page); static void free_l1_table(struct pfn_info *page); @@ -222,7 +215,7 @@ static void __invalidate_shadow_ldt(struct exec_domain *d) } -static inline void invalidate_shadow_ldt(struct exec_domain *d) +void invalidate_shadow_ldt(struct exec_domain *d) { if ( d->arch.shadow_ldt_mapcnt != 0 ) __invalidate_shadow_ldt(d); @@ -254,21 +247,41 @@ int map_ldt_shadow_page(unsigned int off) { struct exec_domain *ed = current; struct domain *d = ed->domain; - unsigned long l1e; + unsigned long l1e, nl1e, gpfn, gmfn; + unsigned gva = ed->arch.ldt_base + (off << PAGE_SHIFT); + int res; if ( unlikely(in_irq()) ) BUG(); - __get_user(l1e, (unsigned long *) - &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]); + shadow_sync_va(ed, gva); + __get_user(l1e, (unsigned long *)&linear_pg_table[l1_linear_offset(gva)]); + + if ( unlikely(!(l1e & _PAGE_PRESENT)) ) + return 0; + + gpfn = l1_pgentry_to_pfn(mk_l1_pgentry(l1e)); + gmfn = __gpfn_to_mfn(d, gpfn); + if ( unlikely(!gmfn) ) + return 0; + 
+ if ( unlikely(shadow_mode_enabled(d)) ) + { + shadow_lock(d); + shadow_remove_all_write_access(d, PGT_l1_shadow, PGT_l1_shadow, gpfn); + } + + res = get_page_and_type(&frame_table[gmfn], d, PGT_ldt_page); + + if ( unlikely(shadow_mode_enabled(d)) ) + shadow_unlock(d); - if ( unlikely(!(l1e & _PAGE_PRESENT)) || - unlikely(!get_page_and_type( - &frame_table[l1_pgentry_to_pfn(mk_l1_pgentry(l1e))], - d, PGT_ldt_page)) ) + if ( unlikely(!res) ) return 0; - ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW); + nl1e = (l1e & ~PAGE_MASK) | (gmfn << PAGE_SHIFT) | _PAGE_RW; + + ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(nl1e); ed->arch.shadow_ldt_mapcnt++; return 1; @@ -337,6 +350,8 @@ get_linear_pagetable( struct pfn_info *page; unsigned long pfn; + ASSERT( !shadow_mode_enabled(d) ); + if ( (root_pgentry_val(re) & _PAGE_RW) ) { MEM_LOG("Attempt to create linear p.t. with write perms"); @@ -372,13 +387,13 @@ get_linear_pagetable( } -static int +int get_page_from_l1e( l1_pgentry_t l1e, struct domain *d) { unsigned long l1v = l1_pgentry_val(l1e); - unsigned long pfn = l1_pgentry_to_pfn(l1e); - struct pfn_info *page = &frame_table[pfn]; + unsigned long mfn = l1_pgentry_to_pfn(l1e); + struct pfn_info *page = &frame_table[mfn]; extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); if ( !(l1v & _PAGE_PRESENT) ) @@ -386,11 +401,11 @@ get_page_from_l1e( if ( unlikely(l1v & L1_DISALLOW_MASK) ) { - MEM_LOG("Bad L1 type settings %p", l1v & L1_DISALLOW_MASK); + MEM_LOG("Bad L1 type settings %p %p", l1v, l1v & L1_DISALLOW_MASK); return 0; } - if ( unlikely(!pfn_is_ram(pfn)) ) + if ( unlikely(!pfn_is_ram(mfn)) ) { /* Revert to caller privileges if FD == DOMID_IO. 
*/ if ( d == dom_io ) @@ -400,9 +415,9 @@ get_page_from_l1e( return 1; if ( IS_CAPABLE_PHYSDEV(d) ) - return domain_iomem_in_pfn(d, pfn); + return domain_iomem_in_pfn(d, mfn); - MEM_LOG("Non-privileged attempt to map I/O space %p", pfn); + MEM_LOG("Non-privileged attempt to map I/O space %p", mfn); return 0; } @@ -420,6 +435,8 @@ get_page_from_l2e( { int rc; + ASSERT( !shadow_mode_enabled(d) ); + if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) return 1; @@ -491,7 +508,7 @@ get_page_from_l4e( #endif /* __x86_64__ */ -static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) +void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) { unsigned long l1v = l1_pgentry_val(l1e); unsigned long pfn = l1_pgentry_to_pfn(l1e); @@ -530,6 +547,8 @@ static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == PGT_ldt_page)) && unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) ) + + // XXX SMP BUG? invalidate_shadow_ldt(e->exec_domain[0]); put_page(page); } @@ -575,6 +594,8 @@ static int alloc_l1_table(struct pfn_info *page) l1_pgentry_t *pl1e; int i; + ASSERT( !shadow_mode_enabled(d) ); + pl1e = map_domain_mem(pfn << PAGE_SHIFT); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) @@ -601,6 +622,11 @@ static int alloc_l2_table(struct pfn_info *page) unsigned long pfn = page_to_pfn(page); l2_pgentry_t *pl2e; int i; + + if ( (PGT_base_page_table == PGT_l2_page_table) && + shadow_mode_enabled(d) ) + return 1; + ASSERT( !shadow_mode_enabled(d) ); pl2e = map_domain_mem(pfn << PAGE_SHIFT); @@ -643,6 +669,8 @@ static int alloc_l3_table(struct pfn_info *page) l3_pgentry_t *pl3e = page_to_virt(page); int i; + ASSERT( !shadow_mode_enabled(d) ); + for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) if ( is_guest_l3_slot(i) && unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) ) @@ -666,6 +694,11 @@ static int alloc_l4_table(struct pfn_info *page) l4_pgentry_t *pl4e = page_to_virt(page); int i; + if ( (PGT_base_page_table == 
PGT_l4_page_table) && + shadow_mode_enabled(d) ) + return 1; + ASSERT( !shadow_mode_enabled(d) ); + for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) if ( is_guest_l4_slot(i) && unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) ) @@ -765,7 +798,7 @@ static inline int update_l1e(l1_pgentry_t *pl1e, if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) || unlikely(o != l1_pgentry_val(ol1e)) ) { - MEM_LOG("Failed to update %p -> %p: saw %p\n", + MEM_LOG("Failed to update %p -> %p: saw %p", l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o); return 0; } @@ -781,6 +814,8 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) unsigned long _ol1e; struct domain *d = current->domain; + ASSERT( !shadow_mode_enabled(d) ); + if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) return 0; ol1e = mk_l1_pgentry(_ol1e); @@ -807,13 +842,12 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) put_page_from_l1e(nl1e, d); return 0; } - - put_page_from_l1e(ol1e, d); - return 1; } - - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) - return 0; + else + { + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + return 0; + } put_page_from_l1e(ol1e, d); return 1; @@ -825,7 +859,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) _t ## _pgentry_val(_o), \ _t ## _pgentry_val(_n)); \ if ( __o != _t ## _pgentry_val(_o) ) \ - MEM_LOG("Failed to update %p -> %p: saw %p\n", \ + MEM_LOG("Failed to update %p -> %p: saw %p", \ _t ## _pgentry_val(_o), _t ## _pgentry_val(_n), __o); \ (__o == _t ## _pgentry_val(_o)); }) @@ -872,13 +906,12 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, put_page_from_l2e(nl2e, pfn); return 0; } - - put_page_from_l2e(ol2e, pfn); - return 1; } - - if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) - return 0; + else + { + if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) + return 0; + } put_page_from_l2e(ol2e, pfn); return 1; @@ -1025,7 +1058,9 @@ int alloc_page_type(struct pfn_info *page, unsigned int type) void free_page_type(struct pfn_info 
*page, unsigned int type) { - struct domain *d = page_get_owner(page); + struct domain *owner = page_get_owner(page); + if ( likely(owner != NULL) && unlikely(shadow_mode_enabled(owner)) ) + return; switch ( type ) { @@ -1050,13 +1085,6 @@ void free_page_type(struct pfn_info *page, unsigned int type) default: BUG(); } - - if ( unlikely(shadow_mode_enabled(d)) && - (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) ) - { - unshadow_table(page_to_pfn(page), type); - put_shadow_status(d); - } } @@ -1096,15 +1124,16 @@ void put_page_type(struct pfn_info *page) if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated)) != x) ) goto again; - /* We cleared the 'valid bit' so we do the clear up. */ + /* We cleared the 'valid bit' so we do the clean up. */ free_page_type(page, x & PGT_type_mask); /* Carry on, but with the 'valid bit' now clear. */ x &= ~PGT_validated; nx &= ~PGT_validated; } } - else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == - (PGT_pinned | 1)) ) + else if ( unlikely(((nx & (PGT_pinned | PGT_count_mask)) == + (PGT_pinned | 1)) && + ((nx & PGT_type_mask) != PGT_writable_page)) ) { /* Page is now only pinned. Make the back pointer mutable again. */ nx |= PGT_va_mutable; @@ -1124,7 +1153,7 @@ int get_page_type(struct pfn_info *page, u32 type) nx = x + 1; if ( unlikely((nx & PGT_count_mask) == 0) ) { - MEM_LOG("Type count overflow on pfn %p\n", page_to_pfn(page)); + MEM_LOG("Type count overflow on pfn %p", page_to_pfn(page)); return 0; } else if ( unlikely((x & PGT_count_mask) == 0) ) @@ -1137,6 +1166,8 @@ int get_page_type(struct pfn_info *page, u32 type) * circumstances should be very rare. */ struct domain *d = page_get_owner(page); + + // XXX SMP bug? 
if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]-> processor], page->tlbflush_timestamp)) ) @@ -1155,14 +1186,24 @@ int get_page_type(struct pfn_info *page, u32 type) nx |= PGT_validated; } } + else if ( unlikely(!(x & PGT_validated)) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->u.inuse.type_info) == x ) + { + rep_nop(); + barrier(); + } + goto again; + } else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) ) { if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) { if ( ((x & PGT_type_mask) != PGT_l2_page_table) || ((type & PGT_type_mask) != PGT_l1_page_table) ) - MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p\n", - x & PGT_type_mask, type, page_to_pfn(page)); + MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %p", + x, type, page_to_pfn(page)); return 0; } else if ( (x & PGT_va_mask) == PGT_va_mutable ) @@ -1178,16 +1219,6 @@ int get_page_type(struct pfn_info *page, u32 type) nx |= PGT_va_unknown; } } - else if ( unlikely(!(x & PGT_validated)) ) - { - /* Someone else is updating validation of this page. Wait... */ - while ( (y = page->u.inuse.type_info) == x ) - { - rep_nop(); - barrier(); - } - goto again; - } } while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); @@ -1197,7 +1228,7 @@ int get_page_type(struct pfn_info *page, u32 type) if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) ) { MEM_LOG("Error while validating pfn %p for type %08x." 
- " caf=%08x taf=%08x\n", + " caf=%08x taf=%08x", page_to_pfn(page), type, page->count_info, page->u.inuse.type_info); @@ -1214,30 +1245,36 @@ int get_page_type(struct pfn_info *page, u32 type) } -int new_guest_cr3(unsigned long pfn) +int new_guest_cr3(unsigned long mfn) { struct exec_domain *ed = current; struct domain *d = ed->domain; - int okay, cpu = smp_processor_id(); - unsigned long old_base_pfn; - - okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d); + int okay; + unsigned long old_base_mfn; + + if ( shadow_mode_enabled(d) ) + okay = get_page_from_pagenr(mfn, d); + else + okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); + if ( likely(okay) ) { invalidate_shadow_ldt(ed); - percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; - old_base_pfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; - ed->arch.guest_table = mk_pagetable(pfn << PAGE_SHIFT); + old_base_mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; + ed->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); update_pagetables(ed); /* update shadow_table and monitor_table */ write_ptbase(ed); - put_page_and_type(&frame_table[old_base_pfn]); + if ( shadow_mode_enabled(d) ) + put_page(&frame_table[old_base_mfn]); + else + put_page_and_type(&frame_table[old_base_mfn]); } else { - MEM_LOG("Error while installing new baseptr %p", pfn); + MEM_LOG("Error while installing new baseptr %p", mfn); } return okay; @@ -1247,10 +1284,11 @@ static int do_extended_command(unsigned long ptr, unsigned long val) { int okay = 1, cpu = smp_processor_id(); unsigned int cmd = val & MMUEXT_CMD_MASK, type; - unsigned long pfn = ptr >> PAGE_SHIFT; - struct pfn_info *page = &frame_table[pfn]; struct exec_domain *ed = current; struct domain *d = ed->domain, *e; + unsigned long gpfn = ptr >> PAGE_SHIFT; + unsigned long mfn = __gpfn_to_mfn(d, gpfn); + struct pfn_info *page = &frame_table[mfn]; u32 x, y, _d, _nd; domid_t domid; grant_ref_t gntref; @@ -1266,17 +1304,29 @@ static int 
do_extended_command(unsigned long ptr, unsigned long val) type = PGT_l1_page_table | PGT_va_mutable; pin_page: - okay = get_page_and_type_from_pagenr(pfn, type, FOREIGNDOM); + if ( unlikely(percpu_info[cpu].foreign && + (shadow_mode_translate(d) || + shadow_mode_translate(percpu_info[cpu].foreign))) ) + { + // oops -- we should be using the foreign domain's P2M + mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn); + page = &frame_table[mfn]; + } + + if ( shadow_mode_enabled(FOREIGNDOM) ) + type = PGT_writable_page; + + okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); if ( unlikely(!okay) ) { - MEM_LOG("Error while pinning pfn %p", pfn); + MEM_LOG("Error while pinning mfn %p", mfn); break; } if ( unlikely(test_and_set_bit(_PGT_pinned, &page->u.inuse.type_info)) ) { - MEM_LOG("Pfn %p already pinned", pfn); + MEM_LOG("mfn %p already pinned", mfn); put_page_and_type(page); okay = 0; break; @@ -1299,10 +1349,19 @@ static int do_extended_command(unsigned long ptr, unsigned long val) #endif /* __x86_64__ */ case MMUEXT_UNPIN_TABLE: - if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) ) + if ( unlikely(percpu_info[cpu].foreign && + (shadow_mode_translate(d) || + shadow_mode_translate(percpu_info[cpu].foreign))) ) + { + // oops -- we should be using the foreign domain's P2M + mfn = __gpfn_to_mfn(FOREIGNDOM, gpfn); + page = &frame_table[mfn]; + } + + if ( unlikely(!(okay = get_page_from_pagenr(mfn, FOREIGNDOM))) ) { - MEM_LOG("Page %p bad domain (dom=%p)", - ptr, page_get_owner(page)); + MEM_LOG("mfn %p bad domain (dom=%p)", + mfn, page_get_owner(page)); } else if ( likely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) ) @@ -1314,28 +1373,29 @@ static int do_extended_command(unsigned long ptr, unsigned long val) { okay = 0; put_page(page); - MEM_LOG("Pfn %p not pinned", pfn); + MEM_LOG("mfn %p not pinned", mfn); } break; case MMUEXT_NEW_BASEPTR: - okay = new_guest_cr3(pfn); + okay = new_guest_cr3(mfn); + percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB; 
break; #ifdef __x86_64__ case MMUEXT_NEW_USER_BASEPTR: - okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d); + okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); if ( unlikely(!okay) ) { - MEM_LOG("Error while installing new baseptr %p", pfn); + MEM_LOG("Error while installing new baseptr %p", mfn); } else { - unsigned long old_pfn = + unsigned long old_mfn = pagetable_val(ed->arch.guest_table_user) >> PAGE_SHIFT; - ed->arch.guest_table_user = mk_pagetable(pfn << PAGE_SHIFT); - if ( old_pfn != 0 ) - put_page_and_type(&frame_table[old_pfn]); + ed->arch.guest_table_user = mk_pagetable(mfn << PAGE_SHIFT); + if ( old_mfn != 0 ) + put_page_and_type(&frame_table[old_mfn]); } break; #endif @@ -1346,12 +1406,14 @@ static int do_extended_command(unsigned long ptr, unsigned long val) case MMUEXT_INVLPG: __flush_tlb_one(ptr); + if ( shadow_mode_enabled(d) ) + shadow_invlpg(ed, ptr); break; case MMUEXT_FLUSH_CACHE: if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) { - MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); + MEM_LOG("Non-physdev domain tried to FLUSH_CACHE."); okay = 0; } else @@ -1362,6 +1424,8 @@ static int do_extended_command(unsigned long ptr, unsigned long val) case MMUEXT_SET_LDT: { + ASSERT( !shadow_mode_external(d) ); + unsigned long ents = val >> MMUEXT_CMD_SHIFT; if ( ((ptr & (PAGE_SIZE-1)) != 0) || (ents > 8192) || @@ -1375,6 +1439,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) (ed->arch.ldt_base != ptr) ) { invalidate_shadow_ldt(ed); + shadow_sync_all(d); ed->arch.ldt_base = ptr; ed->arch.ldt_ents = ents; load_LDT(ed); @@ -1401,7 +1466,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) percpu_info[cpu].foreign = dom_io; break; default: - MEM_LOG("Dom %u cannot set foreign dom\n", d->id); + MEM_LOG("Dom %u cannot set foreign dom", d->id); okay = 0; break; } @@ -1435,10 +1500,10 @@ static int do_extended_command(unsigned long ptr, unsigned long val) gntref = (grant_ref_t)((val & 
0xFF00) | ((ptr >> 2) & 0x00FF)); if ( unlikely(IS_XEN_HEAP_FRAME(page)) || - unlikely(!pfn_is_ram(pfn)) || + unlikely(!pfn_is_ram(mfn)) || unlikely((e = find_domain_by_id(domid)) == NULL) ) { - MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid); + MEM_LOG("Bad frame (%p) or bad domid (%d).", mfn, domid); okay = 0; break; } @@ -1460,7 +1525,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) unlikely(_nd != _d) ) { MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p," - " caf=%08x, taf=%08x\n", page_to_pfn(page), + " caf=%08x, taf=%08x", page_to_pfn(page), d, d->id, unpickle_domptr(_nd), x, page->u.inuse.type_info); spin_unlock(&d->page_alloc_lock); @@ -1496,7 +1561,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) ) { MEM_LOG("Transferee has no reservation headroom (%d,%d), or " - "provided a bad grant ref, or is dying (%p).\n", + "provided a bad grant ref, or is dying (%p).", e->tot_pages, e->max_pages, e->d_flags); spin_unlock(&e->page_alloc_lock); put_domain(e); @@ -1513,7 +1578,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) spin_unlock(&e->page_alloc_lock); /* Transfer is all done: tell the guest about its new page frame. 
*/ - gnttab_notify_transfer(e, gntref, pfn); + gnttab_notify_transfer(e, gntref, mfn); put_domain(e); break; @@ -1529,7 +1594,14 @@ static int do_extended_command(unsigned long ptr, unsigned long val) e = percpu_info[cpu].foreign; if ( unlikely(e == NULL) ) { - MEM_LOG("No FOREIGNDOM to reassign pfn %p to", pfn); + MEM_LOG("No FOREIGNDOM to reassign mfn %p to", mfn); + okay = 0; + break; + } + + if ( unlikely(!pfn_is_ram(mfn)) ) + { + MEM_LOG("Can't reassign non-ram mfn %p", mfn); okay = 0; break; } @@ -1574,7 +1646,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) unlikely(_nd != _d) ) { MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p," - " caf=%08x, taf=%08x\n", page_to_pfn(page), + " caf=%08x, taf=%08x", page_to_pfn(page), d, d->id, unpickle_domptr(_nd), x, page->u.inuse.type_info); okay = 0; @@ -1637,12 +1709,10 @@ int do_mmu_update( #define MMU_UPDATE_PREEMPT_FDOM_MASK (0x7FFFU<domain; u32 type_info; @@ -1653,10 +1723,9 @@ int do_mmu_update( cleanup_writable_pagetable(d); if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(d, ed->arch.guest_table, "pre-mmu"); /* debug */ - - if ( unlikely(shadow_mode_translate(d) ) ) - domain_crash(); + { + check_pagetable(ed, "pre-mmu"); /* debug */ + } /* * If we are resuming after preemption, read how much work we have already @@ -1714,7 +1783,8 @@ int do_mmu_update( } cmd = req.ptr & (sizeof(l1_pgentry_t)-1); - pfn = req.ptr >> PAGE_SHIFT; + gpfn = req.ptr >> PAGE_SHIFT; + mfn = __gpfn_to_mfn(d, gpfn); okay = 0; @@ -1724,107 +1794,91 @@ int do_mmu_update( * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. 
*/ case MMU_NORMAL_PT_UPDATE: - if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) ) + if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) ) { MEM_LOG("Could not get page for normal update"); break; } - if ( likely(prev_pfn == pfn) ) + if ( likely(prev_mfn == mfn) ) { va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK); } else { - if ( prev_pfn != 0 ) + if ( prev_mfn != 0 ) unmap_domain_mem((void *)va); va = (unsigned long)map_domain_mem(req.ptr); - prev_pfn = pfn; + prev_mfn = mfn; } - page = &frame_table[pfn]; + page = &frame_table[mfn]; switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask ) { case PGT_l1_page_table: + ASSERT(!shadow_mode_enabled(d)); if ( likely(get_page_type( page, type_info & (PGT_type_mask|PGT_va_mask))) ) { okay = mod_l1_entry((l1_pgentry_t *)va, - mk_l1_pgentry(req.val)); - - if ( unlikely(shadow_mode_enabled(d)) && okay && - (get_shadow_status(d, page-frame_table) & - PSH_shadowed) ) - { - shadow_l1_normal_pt_update( - req.ptr, req.val, &prev_smfn, &prev_spl1e); - put_shadow_status(d); - } - + mk_l1_pgentry(req.val)); put_page_type(page); } break; case PGT_l2_page_table: + ASSERT(!shadow_mode_enabled(d)); if ( likely(get_page_type(page, PGT_l2_page_table)) ) { okay = mod_l2_entry((l2_pgentry_t *)va, mk_l2_pgentry(req.val), - pfn); - - if ( unlikely(shadow_mode_enabled(d)) && okay && - (get_shadow_status(d, page-frame_table) & - PSH_shadowed) ) - { - shadow_l2_normal_pt_update(req.ptr, req.val); - put_shadow_status(d); - } - + mfn); put_page_type(page); } break; #ifdef __x86_64__ case PGT_l3_page_table: + ASSERT(!shadow_mode_enabled(d)); if ( likely(get_page_type(page, PGT_l3_page_table)) ) { okay = mod_l3_entry((l3_pgentry_t *)va, mk_l3_pgentry(req.val), - pfn); - - if ( unlikely(shadow_mode_enabled(d)) && okay && - (get_shadow_status(d, page-frame_table) & - PSH_shadowed) ) - { - /*XXXshadow_l3_normal_pt_update(req.ptr, req.val);*/ - put_shadow_status(d); - } - + mfn); put_page_type(page); } break; case 
PGT_l4_page_table: + ASSERT(!shadow_mode_enabled(d)); if ( likely(get_page_type(page, PGT_l4_page_table)) ) { okay = mod_l4_entry((l4_pgentry_t *)va, mk_l4_pgentry(req.val), - pfn); - - if ( unlikely(shadow_mode_enabled(d)) && okay && - (get_shadow_status(d, page-frame_table) & - PSH_shadowed) ) - { - /*XXXshadow_l4_normal_pt_update(req.ptr, req.val);*/ - put_shadow_status(d); - } - + mfn); put_page_type(page); } break; #endif /* __x86_64__ */ default: + printk("do_mmu_update writable update: ma=%p val=%p\n", + req.ptr, req.val); if ( likely(get_page_type(page, PGT_writable_page)) ) { + if ( shadow_mode_enabled(d) ) + { + shadow_lock(d); + + if ( shadow_mode_log_dirty(d) ) + __mark_dirty(d, mfn); + + if ( page_is_page_table(page) ) + shadow_mark_mfn_out_of_sync(ed, gpfn, mfn); + } + *(unsigned long *)va = req.val; okay = 1; + + if ( shadow_mode_enabled(d) ) + shadow_unlock(d); + put_page_type(page); } break; @@ -1834,24 +1888,30 @@ int do_mmu_update( break; case MMU_MACHPHYS_UPDATE: - if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) ) + if ( unlikely(!get_page_from_pagenr(mfn, FOREIGNDOM)) ) { MEM_LOG("Could not get page for mach->phys update"); break; } - machine_to_phys_mapping[pfn] = req.val; + if ( unlikely(shadow_mode_translate(FOREIGNDOM) && !IS_PRIV(d)) ) + { + MEM_LOG("can't mutate the m2p of translated guests"); + break; + } + + set_machinetophys(mfn, req.val); okay = 1; /* - * If in log-dirty mode, mark the corresponding pseudo-physical + * If in log-dirty mode, mark the corresponding * page as dirty. 
*/ - if ( unlikely(shadow_mode_log_dirty(d)) && - mark_dirty(d, pfn) ) - d->arch.shadow_dirty_block_count++; + if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) && + mark_dirty(FOREIGNDOM, mfn) ) + FOREIGNDOM->arch.shadow_dirty_block_count++; - put_page(&frame_table[pfn]); + put_page(&frame_table[mfn]); break; /* @@ -1878,17 +1938,18 @@ int do_mmu_update( } out: - if ( prev_pfn != 0 ) + if ( prev_mfn != 0 ) unmap_domain_mem((void *)va); - if ( unlikely(prev_spl1e != 0) ) - unmap_domain_mem((void *)prev_spl1e); - deferred_ops = percpu_info[cpu].deferred_ops; percpu_info[cpu].deferred_ops = 0; if ( deferred_ops & DOP_FLUSH_TLB ) + { local_flush_tlb(); + if ( shadow_mode_enabled(d) ) + shadow_sync_all(d); + } if ( deferred_ops & DOP_RELOAD_LDT ) (void)map_ldt_shadow_page(0); @@ -1904,7 +1965,7 @@ int do_mmu_update( __put_user(done + i, pdone); if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(d, ed->arch.guest_table, "post-mmu"); /* debug */ + check_pagetable(ed, "post-mmu"); /* debug */ UNLOCK_BIGLOCK(d); return rc; @@ -1923,12 +1984,9 @@ int do_update_va_mapping(unsigned long va, perfc_incrc(calls_to_update_va); - if ( unlikely(!__addr_ok(va)) ) + if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) ) return -EINVAL; - if ( unlikely(shadow_mode_translate(d) ) ) - domain_crash(); - LOCK_BIGLOCK(d); cleanup_writable_pagetable(d); @@ -1937,55 +1995,56 @@ int do_update_va_mapping(unsigned long va, * XXX When we make this support 4MB superpages we should also deal with * the case of updating L2 entries. 
*/ - - if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], - mk_l1_pgentry(val))) ) - err = -EINVAL; - - if ( unlikely(shadow_mode_enabled(d)) ) + if ( likely(!shadow_mode_enabled(d)) ) { - unsigned long sval = 0; + if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], + mk_l1_pgentry(val))) ) + err = -EINVAL; + } + else + { + if ( unlikely(percpu_info[cpu].foreign && + (shadow_mode_translate(d) || + shadow_mode_translate(percpu_info[cpu].foreign))) ) + { + // The foreign domain's pfn's are in a different namespace. + // We wouldn't be able to figure out how to (re-)shadow our + // gpte without additional context. + // + domain_crash(); + } + + check_pagetable(ed, "pre-va"); /* debug */ + shadow_lock(d); + + // This is actually overkill - we don't need to sync the L1 itself, + // just everything involved in getting to this L1 (i.e. we need + // linear_pg_table[l1_linear_offset(va)] to be in sync)... + // + __shadow_sync_va(ed, va); + + if ( unlikely(__put_user(val, &l1_pgentry_val( + linear_pg_table[l1_linear_offset(va)]))) ) + err = -EINVAL; + else + { + // also need to update the shadow + unsigned long spte; - l1pte_propagate_from_guest(d, &val, &sval); + l1pte_propagate_from_guest(d, val, &spte); + shadow_set_l1e(va, spte, 0); - if ( unlikely(__put_user(sval, ((unsigned long *)( - &shadow_linear_pg_table[l1_linear_offset(va)])))) ) - { /* - * Since L2's are guranteed RW, failure indicates either that the - * page was not shadowed, or that the L2 entry has not yet been - * updated to reflect the shadow. + * If we're in log-dirty mode then we need to note that we've updated + * the PTE in the PT-holding page. We need the machine frame number + * for this. */ - if ( shadow_mode_external(current->domain) ) - BUG(); // can't use linear_l2_table with external tables. 
+ if ( shadow_mode_log_dirty(d) ) + mark_dirty(d, va_to_l1mfn(ed, va)); - l2_pgentry_t gpde = linear_l2_table[l2_table_offset(va)]; - unsigned long gpfn = l2_pgentry_val(gpde) >> PAGE_SHIFT; - - if (get_shadow_status(d, gpfn)) - { - unsigned long gmfn = __gpfn_to_mfn(d, gpfn); - unsigned long *gl1e = map_domain_mem(gmfn << PAGE_SHIFT); - unsigned l1_idx = l1_table_offset(va); - gl1e[l1_idx] = sval; - unmap_domain_mem(gl1e); - put_shadow_status(d); - - perfc_incrc(shadow_update_va_fail1); - } - else - perfc_incrc(shadow_update_va_fail2); + shadow_unlock(d); + check_pagetable(ed, "post-va"); /* debug */ } - - /* - * If we're in log-dirty mode then we need to note that we've updated - * the PTE in the PT-holding page. We need the machine frame number - * for this. - */ - if ( shadow_mode_log_dirty(d) ) - mark_dirty(d, va_to_l1mfn(va)); - - check_pagetable(d, ed->arch.guest_table, "va"); /* debug */ } deferred_ops = percpu_info[cpu].deferred_ops; @@ -1993,9 +2052,17 @@ int do_update_va_mapping(unsigned long va, if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || unlikely(flags & UVMF_FLUSH_TLB) ) + { local_flush_tlb(); + if ( unlikely(shadow_mode_enabled(d)) ) + shadow_sync_all(d); + } else if ( unlikely(flags & UVMF_INVLPG) ) + { __flush_tlb_one(va); + if ( unlikely(shadow_mode_enabled(d)) ) + shadow_invlpg(current, va); + } if ( unlikely(deferred_ops & DOP_RELOAD_LDT) ) (void)map_ldt_shadow_page(0); @@ -2066,6 +2133,8 @@ long set_gdt(struct exec_domain *ed, if ( (pfn = frames[0]) >= max_page ) goto fail; + shadow_sync_all(d); + /* The first page is special because Xen owns a range of entries in it. 
*/ if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) { @@ -2145,7 +2214,9 @@ long do_set_gdt(unsigned long *frame_list, unsigned int entries) long do_update_descriptor( unsigned long pa, unsigned long word1, unsigned long word2) { - unsigned long pfn = pa >> PAGE_SHIFT; + struct domain *dom = current->domain; + unsigned long gpfn = pa >> PAGE_SHIFT; + unsigned long mfn; struct desc_struct *gdt_pent, d; struct pfn_info *page; struct exec_domain *ed; @@ -2154,16 +2225,21 @@ long do_update_descriptor( d.a = (u32)word1; d.b = (u32)word2; - LOCK_BIGLOCK(current->domain); + LOCK_BIGLOCK(dom); - if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) { - UNLOCK_BIGLOCK(current->domain); + if ( !(mfn = __gpfn_to_mfn(dom, gpfn)) ) { + UNLOCK_BIGLOCK(dom); return -EINVAL; } - page = &frame_table[pfn]; - if ( unlikely(!get_page(page, current->domain)) ) { - UNLOCK_BIGLOCK(current->domain); + if ( (pa & 7) || (mfn >= max_page) || !check_descriptor(&d) ) { + UNLOCK_BIGLOCK(dom); + return -EINVAL; + } + + page = &frame_table[mfn]; + if ( unlikely(!get_page(page, dom)) ) { + UNLOCK_BIGLOCK(dom); return -EINVAL; } @@ -2172,8 +2248,8 @@ long do_update_descriptor( { case PGT_gdt_page: /* Disallow updates of Xen-reserved descriptors in the current GDT. */ - for_each_exec_domain(current->domain, ed) { - if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) && + for_each_exec_domain(dom, ed) { + if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == mfn) && (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) && (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) ) goto out; @@ -2191,11 +2267,25 @@ long do_update_descriptor( break; } + if ( shadow_mode_enabled(dom) ) + { + shadow_lock(dom); + + if ( shadow_mode_log_dirty(dom) ) + __mark_dirty(dom, mfn); + + if ( page_is_page_table(page) ) + shadow_mark_mfn_out_of_sync(current, gpfn, mfn); + } + /* All is good so make the update. 
*/ - gdt_pent = map_domain_mem(pa); + gdt_pent = map_domain_mem((mfn << PAGE_SHIFT) | (pa & ~PAGE_MASK)); memcpy(gdt_pent, &d, 8); unmap_domain_mem(gdt_pent); + if ( shadow_mode_enabled(dom) ) + shadow_unlock(dom); + put_page_type(page); ret = 0; /* success */ @@ -2203,7 +2293,7 @@ long do_update_descriptor( out: put_page(page); - UNLOCK_BIGLOCK(current->domain); + UNLOCK_BIGLOCK(dom); return ret; } @@ -2228,13 +2318,16 @@ int ptwr_debug = 0x0; /* Flush the given writable p.t. page and write-protect it again. */ void ptwr_flush(const int which) { - unsigned long sstat, spte, pte, *ptep, l1va; - l1_pgentry_t *sl1e = NULL, *pl1e, ol1e, nl1e; + unsigned long pte, *ptep, l1va; + l1_pgentry_t *pl1e, ol1e, nl1e; l2_pgentry_t *pl2e; int i, cpu = smp_processor_id(); struct exec_domain *ed = current; struct domain *d = ed->domain; + // not supported in combination with various shadow modes! + ASSERT( !shadow_mode_enabled(d) ); + l1va = ptwr_info[cpu].ptinfo[which].l1va; ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)]; @@ -2244,7 +2337,7 @@ void ptwr_flush(const int which) if ( unlikely(__get_user(pte, ptep)) ) { - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); + MEM_LOG("ptwr: Could not read pte at %p", ptep); /* * Really a bug. We could read this PTE during the initial fault, * and pagetables can't have changed meantime. XXX Multi-CPU guests? @@ -2255,23 +2348,10 @@ void ptwr_flush(const int which) PTWR_PRINT_WHICH, ptep, pte); pte &= ~_PAGE_RW; - if ( unlikely(shadow_mode_enabled(d)) ) - { - /* Write-protect the p.t. page in the shadow page table. */ - l1pte_propagate_from_guest(d, &pte, &spte); - __put_user(spte, (unsigned long *) - &shadow_linear_pg_table[l1_linear_offset(l1va)]); - - /* Is the p.t. page itself shadowed? Map it into Xen space if so. */ - sstat = get_shadow_status(d, pte >> PAGE_SHIFT); - if ( sstat & PSH_shadowed ) - sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT); - } - /* Write-protect the p.t. 
page in the guest page table. */ if ( unlikely(__put_user(pte, ptep)) ) { - MEM_LOG("ptwr: Could not update pte at %p\n", ptep); + MEM_LOG("ptwr: Could not update pte at %p", ptep); /* * Really a bug. We could write this PTE during the initial fault, * and pagetables can't have changed meantime. XXX Multi-CPU guests? @@ -2309,13 +2389,7 @@ void ptwr_flush(const int which) if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) ) { if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) ) - { - if ( unlikely(sl1e != NULL) ) - l1pte_propagate_from_guest( - d, &l1_pgentry_val(nl1e), - &l1_pgentry_val(sl1e[i])); put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]); - } continue; } @@ -2334,22 +2408,19 @@ void ptwr_flush(const int which) domain_crash(); } - if ( unlikely(sl1e != NULL) ) - l1pte_propagate_from_guest( - d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i])); - if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) ) put_page_from_l1e(ol1e, d); } + unmap_domain_mem(pl1e); /* * STEP 3. Reattach the L1 p.t. page into the current address space. */ - if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode_enabled(d)) ) + if ( which == PTWR_PT_ACTIVE ) { - pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx]; + pl2e = &linear_l2_table(ed)[ptwr_info[cpu].ptinfo[which].l2_idx]; *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); } @@ -2358,23 +2429,21 @@ void ptwr_flush(const int which) */ ptwr_info[cpu].ptinfo[which].l1va = 0; - - if ( unlikely(sl1e != NULL) ) - { - unmap_domain_mem(sl1e); - put_shadow_status(d); - } } /* Write page fault handler: check if guest is trying to modify a PTE. */ int ptwr_do_page_fault(unsigned long addr) { + struct exec_domain *ed = current; unsigned long pte, pfn, l2e; struct pfn_info *page; l2_pgentry_t *pl2e; int which, cpu = smp_processor_id(); u32 l2_idx; + // not supported in combination with various shadow modes! 
+ ASSERT( !shadow_mode_enabled(ed->domain) ); + #ifdef __x86_64__ return 0; /* Writable pagetables need fixing for x86_64. */ #endif @@ -2383,10 +2452,7 @@ int ptwr_do_page_fault(unsigned long addr) * Attempt to read the PTE that maps the VA being accessed. By checking for * PDE validity in the L2 we avoid many expensive fixups in __get_user(). */ - if ( shadow_mode_external(current->domain) ) - BUG(); // can't use linear_l2_table with external tables. - - if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & + if ( !(l2_pgentry_val(linear_l2_table(ed)[addr>>L2_PAGETABLE_SHIFT]) & _PAGE_PRESENT) || __get_user(pte, (unsigned long *) &linear_pg_table[l1_linear_offset(addr)]) ) @@ -2414,7 +2480,7 @@ int ptwr_do_page_fault(unsigned long addr) if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) ) { - MEM_LOG("PTWR failure! Pagetable maps itself at %p\n", addr); + MEM_LOG("PTWR failure! Pagetable maps itself at %p", addr); domain_crash(); } @@ -2422,10 +2488,7 @@ int ptwr_do_page_fault(unsigned long addr) * Is the L1 p.t. mapped into the current address space? If so we call it * an ACTIVE p.t., otherwise it is INACTIVE. */ - if ( shadow_mode_external(current->domain) ) - BUG(); // can't use linear_l2_table with external tables. - - pl2e = &linear_l2_table[l2_idx]; + pl2e = &linear_l2_table(ed)[l2_idx]; l2e = l2_pgentry_val(*pl2e); which = PTWR_PT_INACTIVE; if ( (l2e >> PAGE_SHIFT) == pfn ) @@ -2461,8 +2524,7 @@ int ptwr_do_page_fault(unsigned long addr) ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx; /* For safety, disconnect the L1 p.t. page from current space. 
*/ - if ( (which == PTWR_PT_ACTIVE) && - likely(!shadow_mode_enabled(current->domain)) ) + if ( which == PTWR_PT_ACTIVE ) { *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT); #if 1 @@ -2485,7 +2547,7 @@ int ptwr_do_page_fault(unsigned long addr) if ( unlikely(__put_user(pte, (unsigned long *) &linear_pg_table[addr>>PAGE_SHIFT])) ) { - MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *) + MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *) &linear_pg_table[addr>>PAGE_SHIFT]); /* Toss the writable pagetable state and crash. */ unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e); @@ -2531,7 +2593,7 @@ void ptwr_status(void) [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT]; if ( __get_user(pte, ptep) ) { - MEM_LOG("ptwr: Could not read pte at %p\n", ptep); + MEM_LOG("ptwr: Could not read pte at %p", ptep); domain_crash(); } @@ -2547,7 +2609,7 @@ void ptwr_status(void) if ( __get_user(pte, (unsigned long *) ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) { - MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *) + MEM_LOG("ptwr: Could not read pte at %p", (unsigned long *) ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va); domain_crash(); } @@ -2555,433 +2617,6 @@ void ptwr_status(void) page = &frame_table[pfn]; } -void audit_domain(struct domain *d) -{ - int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0; - - void adjust (struct pfn_info *page, int dir, int adjtype) - { - int count = page->count_info & PGC_count_mask; - - if ( adjtype ) - { - int tcount = page->u.inuse.type_info & PGT_count_mask; - - ttot++; - - tcount += dir; - - if ( tcount < 0 ) - { - /* This will only come out once. 
*/ - printk("Audit %d: type count whent below zero pfn=%x " - "taf=%x otaf=%x\n", - d->id, page-frame_table, - page->u.inuse.type_info, - page->tlbflush_timestamp); - } - - page->u.inuse.type_info = - (page->u.inuse.type_info & ~PGT_count_mask) | - (tcount & PGT_count_mask); - } - - ctot++; - count += dir; - if ( count < 0 ) - { - /* This will only come out once. */ - printk("Audit %d: general count whent below zero pfn=%x " - "taf=%x otaf=%x\n", - d->id, page-frame_table, - page->u.inuse.type_info, - page->tlbflush_timestamp); - } - - page->count_info = - (page->count_info & ~PGC_count_mask) | - (count & PGC_count_mask); - - } - - void scan_for_pfn(struct domain *d, unsigned long xpfn) - { - unsigned long pfn, *pt; - struct list_head *list_ent; - struct pfn_info *page; - int i; - - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_l1_page_table: - case PGT_l2_page_table: - pt = map_domain_mem(pfn<> PAGE_SHIFT) == xpfn) ) - printk(" found dom=%d i=%x pfn=%lx t=%x c=%x\n", - d->id, i, pfn, page->u.inuse.type_info, - page->count_info); - unmap_domain_mem(pt); - } - - list_ent = frame_table[pfn].list.next; - } - - } - - void scan_for_pfn_remote(unsigned long xpfn) - { - struct domain *e; - for_each_domain ( e ) - scan_for_pfn( e, xpfn ); - } - - int i, l1, l2; - unsigned long pfn; - struct list_head *list_ent; - struct pfn_info *page; - - if ( d != current->domain ) - domain_pause(d); - synchronise_pagetables(~0UL); - - printk("pt base=%lx sh_info=%x\n", - pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT, - virt_to_page(d->shared_info)-frame_table); - - spin_lock(&d->page_alloc_lock); - - /* PHASE 0 */ - - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - 
page = &frame_table[pfn]; - - if ( page_get_owner(page) != d ) - BUG(); - - if ( (page->u.inuse.type_info & PGT_count_mask) > - (page->count_info & PGC_count_mask) ) - printk("taf > caf %x %x pfn=%lx\n", - page->u.inuse.type_info, page->count_info, pfn ); - -#if 0 /* SYSV shared memory pages plus writeable files. */ - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && - (page->u.inuse.type_info & PGT_count_mask) > 1 ) - { - printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n", - pfn, - page->u.inuse.type_info, - page->count_info ); - scan_for_pfn_remote(pfn); - } -#endif - if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && - (page->u.inuse.type_info & PGT_count_mask) > 1 ) - { - printk("normal page with type count >1: pfn=%lx t=%x c=%x\n", - pfn, - page->u.inuse.type_info, - page->count_info ); - } - - /* Use tlbflush_timestamp to store original type_info. */ - page->tlbflush_timestamp = page->u.inuse.type_info; - - list_ent = frame_table[pfn].list.next; - } - - - /* PHASE 1 */ - if ( pagetable_val(d->exec_domain[0]->arch.guest_table) ) - adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.guest_table) - >>PAGE_SHIFT], -1, 1); - - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - unsigned long *pt; - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - if ( page_get_owner(page) != d ) - BUG(); - - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_l2_page_table: - - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) - printk("Audit %d: L2 not validated %x\n", - d->id, page->u.inuse.type_info); - - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) - printk("Audit %d: L2 not pinned %x\n", - d->id, page->u.inuse.type_info); - else - adjust( page, -1, 1 ); - - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page = &frame_table[l1pfn]; - - if ( page_get_owner(l1page) != d ) - { - printk("L2: 
Skip bizarre page belonging to other " - "dom %p\n", page_get_owner(l1page)); - continue; - } - - if ( (l1page->u.inuse.type_info & PGT_type_mask) == - PGT_l2_page_table ) - printk("Audit %d: [%x] Found %s Linear PT " - "t=%x pfn=%lx\n", d->id, i, - (l1pfn==pfn) ? "Self" : "Other", - l1page->u.inuse.type_info, - l1pfn); - else if ( (l1page->u.inuse.type_info & PGT_type_mask) != - PGT_l1_page_table ) - printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n", - d->id, i, - l1page->u.inuse.type_info, - l1pfn); - - adjust(l1page, -1, 1); - } - } - - unmap_domain_mem(pt); - - break; - - - case PGT_l1_page_table: - - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) - adjust( page, -1, 1 ); - - if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) - printk("Audit %d: L1 not validated %x\n", - d->id, page->u.inuse.type_info); -#if 0 - if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) - printk("Audit %d: L1 not pinned %x\n", - d->id, page->u.inuse.type_info); -#endif - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page = &frame_table[l1pfn]; - - if ( l1pfn < 0x100 ) - { - lowmem_mappings++; - continue; - } - - if ( l1pfn > max_page ) - { - io_mappings++; - continue; - } - - if ( pt[i] & _PAGE_RW ) - { - - if ( (l1page->u.inuse.type_info & PGT_type_mask) == - PGT_l1_page_table || - (l1page->u.inuse.type_info & PGT_type_mask) == - PGT_l2_page_table ) - printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n", - d->id, i, - l1page->u.inuse.type_info, - l1pfn); - - } - - if ( page_get_owner(l1page) != d ) - { - printk("Audit %d: [%lx,%x] Skip foreign page dom=%p " - "pfn=%lx c=%08x t=%08x m2p=%lx\n", - d->id, pfn, i, - page_get_owner(l1page), - l1pfn, - l1page->count_info, - l1page->u.inuse.type_info, - machine_to_phys_mapping[l1pfn]); - continue; - } - - adjust(l1page, -1, 0); - } - } - - unmap_domain_mem(pt); - - break; - } - - list_ent = frame_table[pfn].list.next; - } - - if ( (io_mappings > 0) || (lowmem_mappings > 0) ) - printk("Audit %d: 
Found %d lowmem mappings and %d io mappings\n", - d->id, lowmem_mappings, io_mappings); - - /* PHASE 2 */ - - ctot = ttot = 0; - list_ent = d->page_list.next; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - switch ( page->u.inuse.type_info & PGT_type_mask) - { - case PGT_l1_page_table: - case PGT_l2_page_table: - if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) - { - printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n", - d->id, page->u.inuse.type_info, - page->tlbflush_timestamp, - page->count_info, pfn ); - scan_for_pfn_remote(pfn); - } - default: - if ( (page->count_info & PGC_count_mask) != 1 ) - { - printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n", - d->id, - page->count_info, - page->u.inuse.type_info, - page->tlbflush_timestamp, pfn ); - scan_for_pfn_remote(pfn); - } - break; - } - - list_ent = frame_table[pfn].list.next; - } - - /* PHASE 3 */ - list_ent = d->page_list.next; - l1 = l2 = 0; - for ( i = 0; (list_ent != &d->page_list); i++ ) - { - unsigned long *pt; - pfn = list_entry(list_ent, struct pfn_info, list) - frame_table; - page = &frame_table[pfn]; - - switch ( page->u.inuse.type_info & PGT_type_mask ) - { - case PGT_l2_page_table: - l2++; - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) - adjust( page, 1, 1 ); - - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page; - - if (l1pfn>max_page) - continue; - - l1page = &frame_table[l1pfn]; - - if ( page_get_owner(l1page) == d ) - adjust(l1page, 1, 1); - } - } - - unmap_domain_mem(pt); - break; - - case PGT_l1_page_table: - l1++; - if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned ) - adjust( page, 1, 1 ); - - pt = map_domain_mem( pfn<>PAGE_SHIFT; - struct pfn_info *l1page; - - if (l1pfn>max_page) - continue; - - l1page = &frame_table[l1pfn]; - - if ( (page_get_owner(l1page) != d) || - (l1pfn < 0x100) || (l1pfn > max_page) ) - continue; - - 
adjust(l1page, 1, 0); - } - } - - unmap_domain_mem(pt); - break; - } - - - page->tlbflush_timestamp = 0; - - list_ent = frame_table[pfn].list.next; - } - - spin_unlock(&d->page_alloc_lock); - - if ( pagetable_val(d->exec_domain[0]->arch.guest_table) ) - adjust(&frame_table[pagetable_val( - d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1); - - printk("Audit %d: Done. pages=%d l1=%d l2=%d ctot=%d ttot=%d\n", d->id, i, l1, l2, ctot, ttot ); - - if ( d != current->domain ) - domain_unpause(d); -} - -void audit_domains(void) -{ - struct domain *d; - for_each_domain ( d ) - audit_domain(d); -} - -void audit_domains_key(unsigned char key) -{ - audit_domains(); -} - #endif /* NDEBUG */ /* diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index 1ac97c6da6..b32438f497 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -1,3 +1,23 @@ +/****************************************************************************** + * arch/x86/shadow.c + * + * Copyright (c) 2005 Michael A Fetterman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + #include #include @@ -8,6 +28,10 @@ #include #include +static void shadow_free_snapshot(struct domain *d, + struct out_of_sync_entry *entry); +static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn); + /******** There's a per-domain shadow table spin lock which works fine for SMP @@ -20,34 +44,401 @@ hypercall lock anyhow (at least initially). ********/ -static inline void free_shadow_page( - struct domain *d, struct pfn_info *page) +static inline int +shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, + unsigned long new_type) { - d->arch.shadow_page_count--; + unsigned long min_type, max_type; + struct pfn_info *page = pfn_to_page(gmfn); + int pinned = 0, okay = 1; + + if ( page_out_of_sync(page) ) + { + // Don't know how long ago this snapshot was taken. + // Can't trust it to be recent enough. + // + __shadow_sync_mfn(d, gmfn); + } + + if ( unlikely(mfn_is_page_table(gmfn)) ) + { + min_type = shadow_max_pgtable_type(d, gpfn) + PGT_l1_shadow; + max_type = new_type; + } + else + { + min_type = PGT_l1_shadow; + max_type = PGT_l1_shadow; + } + FSH_LOG("shadow_promote gpfn=%p gmfn=%p nt=%p min=%p max=%p\n", + gmfn, gmfn, new_type, min_type, max_type); + + if ( min_type <= max_type ) + shadow_remove_all_write_access(d, min_type, max_type, gpfn); + + // To convert this page to use as a page table, the writable count + // should now be zero. Test this by grabbing the page as an page table, + // and then immediately releasing. This will also deal with any + // necessary TLB flushing issues for us. + // + // The cruft here about pinning doesn't really work right. This + // needs rethinking/rewriting... 
Need to gracefully deal with the + // TLB flushes required when promoting a writable page, and also deal + // with any outstanding (external) writable refs to this page (by + // refusing to promote it). The pinning headache complicates this + // code -- it would all much get simpler if we stop using + // shadow_lock() and move the shadow code to BIGLOCK(). + // + if ( unlikely(!get_page(page, d)) ) + BUG(); + if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) ) + { + pinned = 1; + put_page_and_type(page); + } + if ( get_page_type(page, PGT_base_page_table) ) + { + put_page_type(page); + set_bit(_PGC_page_table, &frame_table[gmfn].count_info); + } + else + { + printk("shadow_promote: get_page_type failed " + "dom%d gpfn=%p gmfn=%p t=%x\n", + d->id, gpfn, gmfn, new_type); + okay = 0; + } + + // Now put the type back to writable... + if ( unlikely(!get_page_type(page, PGT_writable_page)) ) + BUG(); + if ( unlikely(pinned) ) + { + if ( unlikely(test_and_set_bit(_PGT_pinned, + &page->u.inuse.type_info)) ) + BUG(); // hmm... someone pinned this again? + } + else + put_page_and_type(page); + + return okay; +} + +static inline void +shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn) +{ + ASSERT(frame_table[gmfn].count_info & PGC_page_table); + + if ( shadow_max_pgtable_type(d, gpfn) == PGT_none ) + { + clear_bit(_PGC_page_table, &frame_table[gmfn].count_info); + + if ( page_out_of_sync(pfn_to_page(gmfn)) ) + { + remove_out_of_sync_entries(d, gmfn); + } + } +} + +/* + * Things in shadow mode that collect get_page() refs to the domain's + * pages are: + * - PGC_allocated takes a gen count, just like normal. + * - A writable page can be pinned (paravirtualized guests may consider + * these pages to be L1s or L2s, and don't know the difference). + * Pinning a page takes a gen count (but, for domains in shadow mode, + * it *doesn't* take a type count) + * - CR3 grabs a ref to whatever it points at, just like normal. 
+ * - Shadow mode grabs an initial gen count for itself, as a placehold + * for whatever references will exist. + * - Shadow PTEs that point to a page take a gen count, just like regular + * PTEs. However, they don't get a type count, as get_page_type() is + * hardwired to keep writable pages' counts at 1 for domains in shadow + * mode. + * - Whenever we shadow a page, the entry in the shadow hash grabs a + * general ref to the page. + * - Whenever a page goes out of sync, the out of sync entry grabs a + * general ref to the page. + */ +/* + * pfn_info fields for pages allocated as shadow pages: + * + * All 32 bits of count_info are a simple count of refs to this shadow + * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table), + * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync + * references. + * + * u.inuse._domain is left NULL, to prevent accidently allow some random + * domain from gaining permissions to map this page. + * + * u.inuse.type_info & PGT_type_mask remembers what kind of page is being + * shadowed. + * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed. + * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow + * is currently exists because this is a shadow of a root page, and we + * don't want to let those disappear just because no CR3 is currently pointing + * at it. + * + * tlbflush_timestamp holds a pickled pointer to the domain. + */ + +static inline unsigned long +alloc_shadow_page(struct domain *d, + unsigned long gpfn, unsigned long gmfn, + u32 psh_type) +{ + struct pfn_info *page; + unsigned long smfn; + int pin = 0; + + if ( (psh_type != PGT_snapshot) && + !shadow_promote(d, gpfn, gmfn, psh_type) ) + { + FSH_LOG("promotion of pfn=%p mfn=%p failed! external gnttab refs?\n", + gpfn, gmfn); + return 0; + } + + page = alloc_domheap_page(NULL); + if ( unlikely(page == NULL) ) + { + printk("Couldn't alloc shadow page! 
dom%d count=%d\n", + d->id, d->arch.shadow_page_count); + printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n", + perfc_value(shadow_l1_pages), + perfc_value(shadow_l2_pages), + perfc_value(hl2_table_pages), + perfc_value(snapshot_pages)); + BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */ + } + + smfn = page_to_pfn(page); + + ASSERT( (gmfn & ~PGT_mfn_mask) == 0 ); + page->u.inuse.type_info = psh_type | gmfn; + page->count_info = 0; + page->tlbflush_timestamp = pickle_domptr(d); + + switch ( psh_type ) + { + case PGT_l1_shadow: + perfc_incr(shadow_l1_pages); + d->arch.shadow_page_count++; + break; - switch ( page->u.inuse.type_info & PGT_type_mask ) + case PGT_l2_shadow: + perfc_incr(shadow_l2_pages); + d->arch.shadow_page_count++; + if ( PGT_l2_page_table == PGT_root_page_table ) + pin = 1; + + break; + + case PGT_hl2_shadow: + perfc_incr(hl2_table_pages); + d->arch.hl2_page_count++; + + // treat an hl2 as an L1 for purposes of promotion, + // and as an L2 for purposes of pinning. 
+ // + if ( PGT_l2_page_table == PGT_root_page_table ) + pin = 1; + + break; + + case PGT_snapshot: + perfc_incr(snapshot_pages); + d->arch.snapshot_page_count++; + break; + + default: + printk("Alloc shadow weird page type type=%08x\n", psh_type); + BUG(); + break; + } + + set_shadow_status(d, gpfn, smfn, psh_type); + + if ( pin ) + shadow_pin(smfn); + + return smfn; +} + +static void inline +free_shadow_l1_table(struct domain *d, unsigned long smfn) +{ + l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT); + int i; + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + put_page_from_l1e(pl1e[i], d); + + unmap_domain_mem(pl1e); +} + +static void inline +free_shadow_hl2_table(struct domain *d, unsigned long smfn) +{ + printk("free_shadow_hl2_table(smfn=%p)\n", smfn); + + l1_pgentry_t *pl1e = map_domain_mem(smfn << PAGE_SHIFT); + int i, limit; + + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + + for ( i = 0; i < limit; i++ ) + put_page_from_l1e(pl1e[i], d); + + unmap_domain_mem(pl1e); +} + +static void inline +free_shadow_l2_table(struct domain *d, unsigned long smfn) +{ + printk("free_shadow_l2_table(smfn=%p)\n", smfn); + + unsigned long *pl2e = map_domain_mem(smfn << PAGE_SHIFT); + int i, external = shadow_mode_external(d); + + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + if ( external || is_guest_l2_slot(i) ) + if ( pl2e[i] & _PAGE_PRESENT ) + put_shadow_ref(pl2e[i] >> PAGE_SHIFT); + + if ( (PGT_base_page_table == PGT_l2_page_table) && + shadow_mode_translate(d) && + !shadow_mode_external(d) ) + { + // free the ref to the hl2 + // + put_shadow_ref(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] + >> PAGE_SHIFT); + } + + unmap_domain_mem(pl2e); +} + +void free_shadow_page(unsigned long smfn) +{ + struct pfn_info *page = &frame_table[smfn]; + struct domain *d = unpickle_domptr(page->tlbflush_timestamp); + unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask; + unsigned long gpfn = 
__mfn_to_gpfn(d, gmfn); + unsigned long type = page->u.inuse.type_info & PGT_type_mask; + + ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); + + delete_shadow_status(d, gpfn, type); + + switch ( type ) { - case PGT_l1_page_table: + case PGT_l1_shadow: perfc_decr(shadow_l1_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_l1_table(d, smfn); break; - case PGT_l2_page_table: + case PGT_l2_shadow: perfc_decr(shadow_l2_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_l2_table(d, smfn); + break; + + case PGT_hl2_shadow: + perfc_decr(hl2_table_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_hl2_table(d, smfn); + break; + + case PGT_snapshot: + perfc_decr(snapshot_pages); break; default: - printk("Free shadow weird page type pfn=%08x type=%08x\n", - frame_table-page, page->u.inuse.type_info); + printk("Free shadow weird page type mfn=%08x type=%08x\n", + page-frame_table, page->u.inuse.type_info); break; } + d->arch.shadow_page_count--; + + // No TLB flushes are needed the next time this page gets allocated. + // + page->tlbflush_timestamp = 0; + page->u.free.cpu_mask = 0; + free_domheap_page(page); } -void free_shadow_state(struct domain *d) +static void inline +release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry) +{ + struct pfn_info *page; + + page = &frame_table[entry->gmfn]; + + // Decrement ref count of guest & shadow pages + // + put_page(page); + + // Only use entries that have low bits clear... 
+ // + if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) + put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT); + + // Free the snapshot + // + shadow_free_snapshot(d, entry); +} + +static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn) +{ + struct out_of_sync_entry *entry = d->arch.out_of_sync; + struct out_of_sync_entry **prev = &d->arch.out_of_sync; + + while ( entry ) + { + if ( entry->gmfn == gmfn ) + { + release_out_of_sync_entry(d, entry); + *prev = entry = entry->next; + continue; + } + prev = &entry->next; + entry = entry->next; + } +} + +static void free_out_of_sync_state(struct domain *d) +{ + struct out_of_sync_entry *entry; + struct out_of_sync_entry **tail = NULL; + + // Add the list of out-of-sync entries to the free list of entries. + // Not the smartest code. But it works. + // + for ( entry = d->arch.out_of_sync; entry; entry = entry->next) + { + release_out_of_sync_entry(d, entry); + tail = &entry->next; + } + if ( tail ) + { + *tail = d->arch.out_of_sync_free; + d->arch.out_of_sync_free = d->arch.out_of_sync; + d->arch.out_of_sync = NULL; + } +} + +static void free_shadow_pages(struct domain *d) { int i, free = 0; struct shadow_status *x, *n; + struct exec_domain *e; /* * WARNING! The shadow page table must not currently be in use! @@ -58,21 +449,37 @@ void free_shadow_state(struct domain *d) if( !d->arch.shadow_ht ) return; - /* Free each hash chain in turn. */ + // first, remove any outstanding refs from out_of_sync entries... + // + free_out_of_sync_state(d); + + // second, remove any outstanding refs from ed->arch.shadow_table... + // + for_each_exec_domain(d, e) + { + if ( pagetable_val(e->arch.shadow_table) ) + { + put_shadow_ref(pagetable_val(e->arch.shadow_table) >> PAGE_SHIFT); + e->arch.shadow_table = mk_pagetable(0); + } + } + + // Now, the only refs to shadow pages that are left are from the shadow + // pages themselves. We can just free them. 
+ // for ( i = 0; i < shadow_ht_buckets; i++ ) { /* Skip empty buckets. */ x = &d->arch.shadow_ht[i]; - if ( x->pfn == 0 ) + if ( x->gpfn_and_flags == 0 ) continue; /* Free the head page. */ - free_shadow_page( - d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]); + free_shadow_page(x->smfn); /* Reinitialise the head node. */ - x->pfn = 0; - x->smfn_and_flags = 0; + x->gpfn_and_flags = 0; + x->smfn = 0; n = x->next; x->next = NULL; @@ -82,16 +489,15 @@ void free_shadow_state(struct domain *d) for ( x = n; x != NULL; x = n ) { /* Free the shadow page. */ - free_shadow_page( - d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]); + free_shadow_page(x->smfn); /* Re-initialise the chain node. */ - x->pfn = 0; - x->smfn_and_flags = 0; + x->gpfn_and_flags = 0; + x->smfn = 0; /* Add to the free list. */ - n = x->next; - x->next = d->arch.shadow_ht_free; + n = x->next; + x->next = d->arch.shadow_ht_free; d->arch.shadow_ht_free = x; free++; @@ -103,80 +509,140 @@ void free_shadow_state(struct domain *d) SH_LOG("Free shadow table. Freed=%d.", free); } -static inline int clear_shadow_page( - struct domain *d, struct shadow_status *x) +void shadow_mode_init(void) { - unsigned long *p; - int restart = 0; - struct pfn_info *spage = &frame_table[x->smfn_and_flags & PSH_pfn_mask]; +} - // We don't clear hl2_table's here. At least not yet. - if ( x->pfn & PSH_hl2 ) - return 0; +static void alloc_monitor_pagetable(struct exec_domain *ed) +{ + unsigned long mmfn; + l2_pgentry_t *mpl2e; + struct pfn_info *mmfn_info; + struct domain *d = ed->domain; - switch ( spage->u.inuse.type_info & PGT_type_mask ) - { - /* We clear L2 pages by zeroing the guest entries. 
*/ - case PGT_l2_page_table: - p = map_domain_mem((spage - frame_table) << PAGE_SHIFT); - if ( shadow_mode_external(d) ) - memset(p, 0, L2_PAGETABLE_ENTRIES * sizeof(*p)); - else - memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p)); - unmap_domain_mem(p); - break; + ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */ - /* We clear L1 pages by freeing them: no benefit from zeroing them. */ - case PGT_l1_page_table: - delete_shadow_status(d, x->pfn); - free_shadow_page(d, spage); - restart = 1; /* We need to go to start of list again. */ - break; - } + mmfn_info = alloc_domheap_page(NULL); + ASSERT( mmfn_info ); + + mmfn = (unsigned long) (mmfn_info - frame_table); + mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT); + memset(mpl2e, 0, PAGE_SIZE); + + memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + + mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = + mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) + | __PAGE_HYPERVISOR); - return restart; + // map the phys_to_machine map into the Read-Only MPT space for this domain + mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = + mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR); + + ed->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT); + ed->arch.monitor_vtable = mpl2e; } -static void clear_shadow_state(struct domain *d) +/* + * Free the pages for monitor_table and hl2_table + */ +void free_monitor_pagetable(struct exec_domain *ed) { - int i; - struct shadow_status *x; - - shadow_audit(d, 1); + l2_pgentry_t *mpl2e, hl2e; + unsigned long mfn; - for ( i = 0; i < shadow_ht_buckets; i++ ) - { - retry: - /* Skip empty buckets. 
*/ - x = &d->arch.shadow_ht[i]; - if ( x->pfn == 0 ) - continue; + ASSERT( pagetable_val(ed->arch.monitor_table) ); + ASSERT( shadow_mode_external(ed->domain) ); + + mpl2e = ed->arch.monitor_vtable; - if ( clear_shadow_page(d, x) ) - goto retry; + /* + * First get the mfn for hl2_table by looking at monitor_table + */ + hl2e = mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]; + ASSERT(l2_pgentry_val(hl2e) & _PAGE_PRESENT); + mfn = l2_pgentry_val(hl2e) >> PAGE_SHIFT; + ASSERT(mfn); - for ( x = x->next; x != NULL; x = x->next ) - if ( clear_shadow_page(d, x) ) - goto retry; + put_shadow_ref(mfn); + unmap_domain_mem(mpl2e); - shadow_audit(d, 0); - } + /* + * Then free monitor_table. + */ + mfn = (pagetable_val(ed->arch.monitor_table)) >> PAGE_SHIFT; + free_domheap_page(&frame_table[mfn]); - SH_VLOG("Scan shadow table. l1=%d l2=%d", - perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages)); + ed->arch.monitor_table = mk_pagetable(0); + ed->arch.monitor_vtable = 0; } - -void shadow_mode_init(void) +int __shadow_mode_enable(struct domain *d, unsigned int mode) { -} + struct exec_domain *ed; + for_each_exec_domain(d, ed) + { + invalidate_shadow_ldt(ed); -int __shadow_mode_enable(struct domain *d, unsigned int mode) -{ - d->arch.shadow_mode = mode; + // We need to set these up for __update_pagetables(). + // See the comment there. 
+ + /* + * arch.guest_vtable + */ + if ( ed->arch.guest_vtable && + (ed->arch.guest_vtable != __linear_l2_table) ) + { + unmap_domain_mem(ed->arch.guest_vtable); + } + if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) + ed->arch.guest_vtable = __linear_l2_table; + else + ed->arch.guest_vtable = NULL; + + /* + * arch.shadow_vtable + */ + if ( ed->arch.shadow_vtable && + (ed->arch.shadow_vtable != __shadow_linear_l2_table) ) + { + unmap_domain_mem(ed->arch.shadow_vtable); + } + if ( !(mode & SHM_external) ) + ed->arch.shadow_vtable = __shadow_linear_l2_table; + else + ed->arch.shadow_vtable = NULL; + + /* + * arch.hl2_vtable + */ + if ( ed->arch.hl2_vtable && + (ed->arch.hl2_vtable != __linear_hl2_table) ) + { + unmap_domain_mem(ed->arch.hl2_vtable); + } + if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) + ed->arch.hl2_vtable = __linear_hl2_table; + else + ed->arch.hl2_vtable = NULL; + + /* + * arch.monitor_table & arch.monitor_vtable + */ + if ( ed->arch.monitor_vtable ) + { + free_monitor_pagetable(ed); + } + if ( mode & SHM_external ) + { + alloc_monitor_pagetable(ed); + } + } - if (!d->arch.shadow_ht) + if ( !d->arch.shadow_ht ) { d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); if ( d->arch.shadow_ht == NULL ) @@ -186,7 +652,7 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) shadow_ht_buckets * sizeof(struct shadow_status)); } - if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap) + if ( shadow_mode_log_dirty(d) && !d->arch.shadow_dirty_bitmap ) { d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63; d->arch.shadow_dirty_bitmap = @@ -201,6 +667,63 @@ int __shadow_mode_enable(struct domain *d, unsigned int mode) d->arch.shadow_dirty_bitmap_size/8); } + printk("audit1\n"); + _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__); + printk("audit1 done\n"); + + // Get rid of any shadow pages from any previous shadow mode. 
+ // + free_shadow_pages(d); + + printk("audit2\n"); + _audit_domain(d, AUDIT_ALREADY_LOCKED | AUDIT_ERRORS_OK, __FILE__, __LINE__); + printk("audit2 done\n"); + + // Turn off writable page tables. + // It doesn't mix with shadow mode. + // + vm_assist(d, VMASST_CMD_disable, VMASST_TYPE_writable_pagetables); + + /* + * Tear down it's counts by disassembling its page-table-based ref counts. + * Also remove CR3's gcount/tcount. + * That leaves things like GDTs and LDTs and external refs in tact. + * + * Most pages will be writable tcount=0. + * Some will still be L1 tcount=0 or L2 tcount=0. + * Maybe some pages will be type none tcount=0. + * Pages granted external writable refs (via grant tables?) will + * still have a non-zero tcount. That's OK. + * + * gcounts will generally be 1 for PGC_allocated. + * GDTs and LDTs will have additional gcounts. + * Any grant-table based refs will still be in the gcount. + * + * We attempt to grab writable refs to each page (thus setting its type). + * Immediately put back those type refs. + * + * Assert that no pages are left with L1/L2/L3/L4 type. 
+ */ + audit_adjust_pgtables(d, -1, 1); + d->arch.shadow_mode = mode; + + struct list_head *list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); + if ( !get_page_type(page, PGT_writable_page) ) + BUG(); + put_page_type(page); + + list_ent = page->list.next; + } + + audit_adjust_pgtables(d, 1, 1); + + printk("audit3\n"); + _audit_domain(d, AUDIT_ALREADY_LOCKED, __FILE__, __LINE__); + printk("audit3 done\n"); + return 0; nomem: @@ -219,13 +742,10 @@ int shadow_mode_enable(struct domain *d, unsigned int mode) return rc; } -void __shadow_mode_disable(struct domain *d) +static void free_shadow_ht_entries(struct domain *d) { struct shadow_status *x, *n; - free_shadow_state(d); - d->arch.shadow_mode = 0; - SH_VLOG("freed tables count=%d l1=%d l2=%d", d->arch.shadow_page_count, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages)); @@ -239,6 +759,8 @@ void __shadow_mode_disable(struct domain *d) } d->arch.shadow_ht_extras = NULL; + d->arch.shadow_ht_free = NULL; + ASSERT(d->arch.shadow_extras_count == 0); SH_LOG("freed extras, now %d", d->arch.shadow_extras_count); @@ -253,6 +775,45 @@ void __shadow_mode_disable(struct domain *d) d->arch.shadow_ht = NULL; } +static void free_out_of_sync_entries(struct domain *d) +{ + struct out_of_sync_entry *x, *n; + + n = d->arch.out_of_sync_extras; + while ( (x = n) != NULL ) + { + d->arch.out_of_sync_extras_count--; + n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size])); + xfree(x); + } + + d->arch.out_of_sync_extras = NULL; + d->arch.out_of_sync_free = NULL; + d->arch.out_of_sync = NULL; + + ASSERT(d->arch.out_of_sync_extras_count == 0); + FSH_LOG("freed extra out_of_sync entries, now %d", + d->arch.out_of_sync_extras_count); +} + +void __shadow_mode_disable(struct domain *d) +{ + // This needs rethinking for the full shadow mode stuff. 
+ // + // Among other things, ref counts need to be restored to a sensible + // state for a non-shadow-mode guest... + // This is probably easiest to do by stealing code from audit_domain(). + // + BUG(); + + free_shadow_pages(d); + + d->arch.shadow_mode = 0; + + free_shadow_ht_entries(d); + free_out_of_sync_entries(d); +} + static int shadow_mode_table_op( struct domain *d, dom0_shadow_control_t *sc) { @@ -272,7 +833,7 @@ static int shadow_mode_table_op( switch ( op ) { case DOM0_SHADOW_CONTROL_OP_FLUSH: - free_shadow_state(d); + free_shadow_pages(d); d->arch.shadow_fault_count = 0; d->arch.shadow_dirty_count = 0; @@ -282,7 +843,7 @@ static int shadow_mode_table_op( break; case DOM0_SHADOW_CONTROL_OP_CLEAN: - clear_shadow_state(d); + free_shadow_pages(d); sc->stats.fault_count = d->arch.shadow_fault_count; sc->stats.dirty_count = d->arch.shadow_dirty_count; @@ -394,13 +955,13 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) break; case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: - free_shadow_state(d); + free_shadow_pages(d); rc = __shadow_mode_enable(d, SHM_enable); break; case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: - free_shadow_state(d); - rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_log_dirty); + free_shadow_pages(d); + rc = __shadow_mode_enable(d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty); break; default: @@ -418,249 +979,828 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) return rc; } -static inline struct pfn_info *alloc_shadow_page(struct domain *d) +/* + * XXX KAF: Why is this VMX specific? 
+ */ +void vmx_shadow_clear_state(struct domain *d) +{ + SH_VVLOG("vmx_clear_shadow_state:"); + shadow_lock(d); + free_shadow_pages(d); + shadow_unlock(d); +} + +static unsigned long +shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, + unsigned long smfn) { - struct pfn_info *page = alloc_domheap_page(NULL); + unsigned long hl2mfn; + l1_pgentry_t *hl2; + l2_pgentry_t *gl2; + int i, limit; - d->arch.shadow_page_count++; + ASSERT(PGT_base_page_table == PGT_l2_page_table); - if ( unlikely(page == NULL) ) + if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) ) + { + printk("Couldn't alloc an HL2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn); + BUG(); /* XXX Deal gracefully with failure. */ + } + + perfc_incrc(shadow_hl2_table_count); + + ASSERT( pagetable_val(current->arch.guest_table) == (gmfn << PAGE_SHIFT) ); + gl2 = current->arch.guest_vtable; + + hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT); + + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + + for ( i = 0; i < limit; i++ ) + { + unsigned long gl2e = l2_pgentry_val(gl2[i]); + unsigned long mfn; + + if ( gl2e & _PAGE_PRESENT ) + { + mfn = __gpfn_to_mfn(d, gl2e >> PAGE_SHIFT); + hl2[i] = mk_l1_pgentry((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + get_page(pfn_to_page(mfn), d); + } + else + hl2[i] = mk_l1_pgentry(0); + } + + if ( !shadow_mode_external(d) ) + { + memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0, + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + + // Setup easy access to the GL2, SL2, and HL2 frames. 
+ // + hl2[l2_table_offset(LINEAR_PT_VIRT_START)] = + mk_l1_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + mk_l1_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + hl2[l2_table_offset(PERDOMAIN_VIRT_START)] = + mk_l1_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + } + + unmap_domain_mem(hl2); + + return hl2mfn; +} + +/* + * This could take and use a snapshot, and validate the entire page at + * once, or it could continue to fault in entries one at a time... + * Might be worth investigating... + */ +static unsigned long shadow_l2_table( + struct domain *d, unsigned long gpfn, unsigned long gmfn) +{ + unsigned long smfn; + l2_pgentry_t *spl2e; + + SH_VVLOG("shadow_l2_table(gpfn=%p, gmfn=%p)", gpfn, gmfn); + + perfc_incrc(shadow_l2_table_count); + + if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) ) + { + printk("Couldn't alloc an L2 shadow for pfn=%p mfn=%p\n", gpfn, gmfn); + BUG(); /* XXX Deal gracefully with failure. */ + } + + spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT); + + /* Install hypervisor and 2x linear p.t. mapings. */ + if ( (PGT_base_page_table == PGT_l2_page_table) && + !shadow_mode_external(d) ) + { + /* + * We could proactively fill in PDEs for pages that are already + * shadowed *and* where the guest PDE has _PAGE_ACCESSED set + * (restriction required for coherence of the accessed bit). However, + * we tried it and it didn't help performance. This is simpler. + */ + memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t)); + + /* Install hypervisor and 2x linear p.t. mapings. 
*/ + memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + + if ( shadow_mode_translate(d) ) // NB: not external + { + unsigned long hl2mfn; + if ( unlikely(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow)) ) + hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); + + // shadow_mode_translate (but not external) sl2 tables hold a + // ref to their hl2. + // + get_shadow_ref(hl2mfn); + + spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = + mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + } + else + spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = + mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + + spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + + spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = + mk_l2_pgentry(__pa(page_get_owner( + &frame_table[gmfn])->arch.mm_perdomain_pt) | + __PAGE_HYPERVISOR); + } + else + { + memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t)); + } + + unmap_domain_mem(spl2e); + + SH_VLOG("shadow_l2_table(%p -> %p)", gmfn, smfn); + return smfn; +} + +void shadow_map_l1_into_current_l2(unsigned long va) +{ + struct exec_domain *ed = current; + struct domain *d = ed->domain; + unsigned long *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, gl1mfn, sl1mfn; + int i, init_table = 0; + + __guest_get_l2e(ed, va, &gl2e); + ASSERT(gl2e & _PAGE_PRESENT); + gl1pfn = gl2e >> PAGE_SHIFT; + + if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) ) { - printk("Couldn't alloc shadow page! count=%d\n", - d->arch.shadow_page_count); - SH_VLOG("Shadow tables l1=%d l2=%d", - perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages)); + /* This L1 is NOT already shadowed so we need to shadow it. */ + SH_VVLOG("4a: l1 not shadowed"); + + gl1mfn = __gpfn_to_mfn(d, gl1pfn); + if ( unlikely(!gl1mfn) ) + { + // Attempt to use an invalid pfn as an L1 page. 
+ // XXX this needs to be more graceful! + BUG(); + } + + if ( unlikely(!(sl1mfn = + alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) ) + { + printk("Couldn't alloc an L1 shadow for pfn=%p mfn=%p\n", + gl1pfn, gl1mfn); + BUG(); /* XXX Need to deal gracefully with failure. */ + } + + perfc_incrc(shadow_l1_table_count); + init_table = 1; + } + else + { + /* This L1 is shadowed already, but the L2 entry is missing. */ + SH_VVLOG("4b: was shadowed, l2 missing (%p)", sl1mfn); + } + +#ifndef NDEBUG + unsigned long old_sl2e; + __shadow_get_l2e(ed, va, &old_sl2e); + ASSERT( !(old_sl2e & _PAGE_PRESENT) ); +#endif + + get_shadow_ref(sl1mfn); + l2pde_general(d, &gl2e, &sl2e, sl1mfn); + __guest_set_l2e(ed, va, gl2e); + __shadow_set_l2e(ed, va, sl2e); + + if ( init_table ) + { + gpl1e = (unsigned long *) + &(linear_pg_table[l1_linear_offset(va) & + ~(L1_PAGETABLE_ENTRIES-1)]); + + spl1e = (unsigned long *) + &(shadow_linear_pg_table[l1_linear_offset(va) & + ~(L1_PAGETABLE_ENTRIES-1)]); + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + l1pte_propagate_from_guest(d, gpl1e[i], &spl1e[i]); + if ( spl1e[i] & _PAGE_PRESENT ) + get_page_from_l1e(mk_l1_pgentry(spl1e[i]), d); + } + } +} + +void shadow_invlpg(struct exec_domain *ed, unsigned long va) +{ + struct domain *d = ed->domain; + unsigned long gpte, spte; + + ASSERT(shadow_mode_enabled(d)); + + shadow_lock(d); + + __shadow_sync_va(ed, va); + + // XXX mafetter: will need to think about 4MB pages... + + // It's not strictly necessary to update the shadow here, + // but it might save a fault later. 
+ // + if (__get_user(gpte, (unsigned long *) + &linear_pg_table[va >> PAGE_SHIFT])) { + perfc_incrc(shadow_invlpg_faults); + return; + } + l1pte_propagate_from_guest(d, gpte, &spte); + shadow_set_l1e(va, spte, 1); + + shadow_unlock(d); +} + +struct out_of_sync_entry * +shadow_alloc_oos_entry(struct domain *d) +{ + struct out_of_sync_entry *f, *extra; + unsigned size, i; + + if ( unlikely(d->arch.out_of_sync_free == NULL) ) + { + FSH_LOG("Allocate more fullshadow tuple blocks."); + + size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f)); + extra = xmalloc_bytes(size); + + /* XXX Should be more graceful here. */ + if ( extra == NULL ) + BUG(); + + memset(extra, 0, size); + + /* Record the allocation block so it can be correctly freed later. */ + d->arch.out_of_sync_extras_count++; + *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = + d->arch.out_of_sync_extras; + d->arch.out_of_sync_extras = &extra[0]; + + /* Thread a free chain through the newly-allocated nodes. */ + for ( i = 0; i < (out_of_sync_extra_size - 1); i++ ) + extra[i].next = &extra[i+1]; + extra[i].next = NULL; + + /* Add the new nodes to the free list. */ + d->arch.out_of_sync_free = &extra[0]; + } + + /* Allocate a new node from the quicklist. 
*/ + f = d->arch.out_of_sync_free; + d->arch.out_of_sync_free = f->next; + + return f; +} + +static unsigned long +shadow_make_snapshot( + struct domain *d, unsigned long gpfn, unsigned long gmfn) +{ + unsigned long smfn; + void *original, *snapshot; + + if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) ) + { + ASSERT(__shadow_status(d, gpfn, PGT_snapshot)); + return SHADOW_SNAPSHOT_ELSEWHERE; + } + + perfc_incrc(shadow_make_snapshot); + + if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) ) + { + printk("Couldn't alloc fullshadow snapshot for pfn=%p mfn=%p!\n" + "Dom%d snapshot_count_count=%d\n", + gpfn, gmfn, d->id, d->arch.snapshot_page_count); BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */ } - return page; + get_shadow_ref(smfn); + + original = map_domain_mem(gmfn << PAGE_SHIFT); + snapshot = map_domain_mem(smfn << PAGE_SHIFT); + memcpy(snapshot, original, PAGE_SIZE); + unmap_domain_mem(original); + unmap_domain_mem(snapshot); + + return smfn; +} + +static void +shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry) +{ + void *snapshot; + + if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) + return; + + // Clear the out_of_sync bit. + // + clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info); + + // XXX Need to think about how to protect the domain's + // information less expensively. 
+ // + snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT); + memset(snapshot, 0, PAGE_SIZE); + unmap_domain_mem(snapshot); + + put_shadow_ref(entry->snapshot_mfn); +} + +struct out_of_sync_entry * +shadow_mark_mfn_out_of_sync(struct exec_domain *ed, unsigned long gpfn, + unsigned long mfn) +{ + struct domain *d = ed->domain; + struct pfn_info *page = &frame_table[mfn]; + struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d); + + ASSERT(spin_is_locked(&d->arch.shadow_lock)); + ASSERT(pfn_is_ram(mfn)); + //ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page); + if (!((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page)) + { + printk("assertion failed: gpfn=%p gmfn=%p t=%p\n", + gpfn, mfn, page->u.inuse.type_info); + BUG(); + } + + FSH_LOG("mark_mfn_out_of_sync(gpfn=%p, mfn=%p) c=%p t=%p", + gpfn, mfn, page->count_info, page->u.inuse.type_info); + + // XXX this will require some more thought... Cross-domain sharing and + // modification of page tables? Hmm... + // + if ( d != page_get_owner(page) ) + BUG(); + + perfc_incrc(shadow_mark_mfn_out_of_sync_calls); + + entry->gpfn = gpfn; + entry->gmfn = mfn; + entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn); + entry->writable_pl1e = -1; + + // increment guest's ref count to represent the entry in the + // full shadow out-of-sync list. + // + get_page(page, d); + + // Add to the out-of-sync list + // + entry->next = d->arch.out_of_sync; + d->arch.out_of_sync = entry; + + return entry; +} + +void shadow_mark_out_of_sync( + struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, unsigned long va) +{ + struct out_of_sync_entry *entry = + shadow_mark_mfn_out_of_sync(ed, gpfn, mfn); + unsigned long sl2e; + + // We need the address of shadow PTE that maps @va. + // It might not exist yet. Make sure it's there. 
+ // + __shadow_get_l2e(ed, va, &sl2e); + if ( !(sl2e & _PAGE_PRESENT) ) + { + // either this L1 isn't shadowed yet, or the shadow isn't linked into + // the current L2. + shadow_map_l1_into_current_l2(va); + __shadow_get_l2e(ed, va, &sl2e); + } + ASSERT(sl2e & _PAGE_PRESENT); + + // NB: this is stored as a machine address. + entry->writable_pl1e = + ((sl2e & PAGE_MASK) | + (sizeof(l1_pgentry_t) * l1_table_offset(va))); + ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ); + + // Increment shadow's page count to represent the reference + // inherent in entry->writable_pl1e + // + get_shadow_ref(sl2e >> PAGE_SHIFT); + + FSH_LOG("mark_out_of_sync(va=%p -> writable_pl1e=%p)", + va, entry->writable_pl1e); +} + +/* + * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches. + * Returns 0 otherwise. + */ +static int snapshot_entry_matches( + struct exec_domain *ed, unsigned long gmfn, unsigned index) +{ + unsigned long gpfn = __mfn_to_gpfn(ed->domain, gmfn); + unsigned long smfn = __shadow_status(ed->domain, gpfn, PGT_snapshot); + unsigned long *guest, *snapshot; + int compare; + + ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); + + perfc_incrc(snapshot_entry_matches_calls); + + if ( !smfn ) + return 0; + + guest = map_domain_mem(gmfn << PAGE_SHIFT); + snapshot = map_domain_mem(smfn << PAGE_SHIFT); + + // This could probably be smarter, but this is sufficent for + // our current needs. + // + compare = (guest[index] == snapshot[index]); + + unmap_domain_mem(guest); + unmap_domain_mem(snapshot); + +#ifdef PERF_COUNTERS + if ( compare ) + perfc_incrc(snapshot_entry_matches_true); +#endif + + return compare; +} + +/* + * Returns 1 if va's shadow mapping is out-of-sync. + * Returns 0 otherwise. 
+ */ +int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va) +{ + struct domain *d = ed->domain; + unsigned long l2mfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; + unsigned long l2e; + unsigned long l1mfn; + + ASSERT(spin_is_locked(&d->arch.shadow_lock)); + + perfc_incrc(shadow_out_of_sync_calls); + + if ( page_out_of_sync(&frame_table[l2mfn]) && + !snapshot_entry_matches(ed, l2mfn, l2_table_offset(va)) ) + return 1; + + __guest_get_l2e(ed, va, &l2e); + if ( !(l2e & _PAGE_PRESENT) ) + return 0; + + l1mfn = __gpfn_to_mfn(d, l2e >> PAGE_SHIFT); + + // If the l1 pfn is invalid, it can't be out of sync... + if ( !l1mfn ) + return 0; + + if ( page_out_of_sync(&frame_table[l1mfn]) && + !snapshot_entry_matches(ed, l1mfn, l1_table_offset(va)) ) + return 1; + + return 0; +} + +static u32 remove_all_write_access_in_ptpage( + struct domain *d, unsigned long pt_mfn, unsigned long readonly_mfn) +{ + unsigned long *pt = map_domain_mem(pt_mfn << PAGE_SHIFT); + unsigned long match = + (readonly_mfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_PRESENT; + unsigned long mask = PAGE_MASK | _PAGE_RW | _PAGE_PRESENT; + int i; + u32 count = 0; + int is_l1_shadow = + ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) == + PGT_l1_shadow); + + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( unlikely(((pt[i] ^ match) & mask) == 0) ) + { + unsigned long old = pt[i]; + unsigned long new = old & ~_PAGE_RW; + + if ( is_l1_shadow ) + get_page_from_l1e(mk_l1_pgentry(new), d); + + count++; + pt[i] = new; + + if ( is_l1_shadow ) + put_page_from_l1e(mk_l1_pgentry(old), d); + + FSH_LOG("removed write access to mfn=%p in smfn=%p entry %x " + "is_l1_shadow=%d\n", + readonly_mfn, pt_mfn, i, is_l1_shadow); + } + } + + unmap_domain_mem(pt); + + return count; } -void unshadow_table(unsigned long gpfn, unsigned int type) +u32 shadow_remove_all_write_access( + struct domain *d, unsigned min_type, unsigned max_type, unsigned long gpfn) { - unsigned long smfn; - struct domain *d = 
page_get_owner(&frame_table[gpfn]); - - SH_VLOG("unshadow_table type=%08x gpfn=%p", type, gpfn); + int i; + struct shadow_status *a; + unsigned long gmfn = __gpfn_to_mfn(d, gpfn); + unsigned long sl1mfn = __shadow_status(d, gpfn, PGT_l1_shadow); + u32 count = 0; - perfc_incrc(unshadow_table_count); + ASSERT(spin_is_locked(&d->arch.shadow_lock)); + ASSERT(gmfn); - /* - * This function is the same for all p.t. pages. Even for multi-processor - * guests there won't be a race here as this CPU was the one that - * cmpxchg'ed the page to invalid. - */ - smfn = __shadow_status(d, gpfn) & PSH_pfn_mask; - delete_shadow_status(d, gpfn); - free_shadow_page(d, &frame_table[smfn]); -} + for (i = 0; i < shadow_ht_buckets; i++) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + if ( ((a->gpfn_and_flags & PGT_type_mask) >= min_type) && + ((a->gpfn_and_flags & PGT_type_mask) <= max_type) ) + { + switch ( a->gpfn_and_flags & PGT_type_mask ) + { + case PGT_l1_shadow: + count += + remove_all_write_access_in_ptpage(d, a->smfn, gmfn); + break; + case PGT_l2_shadow: + if ( sl1mfn ) + count += + remove_all_write_access_in_ptpage(d, a->smfn, + sl1mfn); + break; + case PGT_hl2_shadow: + // nothing to do here... + break; + default: + // need to flush this out for 4 level page tables. + BUG(); + } + } + a = a->next; + } + } -/* - * XXX KAF: - * 1. Why is this VMX specific? - * 2. Why is VMX using clear_state() rather than free_state()? - * (could we get rid of clear_state and fold into free_state?) 
- */ -void vmx_shadow_clear_state(struct domain *d) -{ - SH_VVLOG("vmx_clear_shadow_state:"); - shadow_lock(d); - clear_shadow_state(d); - shadow_unlock(d); + return count; } -unsigned long shadow_l2_table( - struct domain *d, unsigned long gmfn) +static u32 remove_all_access_in_page( + struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn) { - struct pfn_info *spfn_info; - unsigned long spfn; - unsigned long gpfn; + unsigned long *pl1e = map_domain_mem(l1mfn << PAGE_SHIFT); + unsigned long match = (forbidden_gmfn << PAGE_SHIFT) | _PAGE_PRESENT; + unsigned long mask = PAGE_MASK | _PAGE_PRESENT; + int i; + u32 count = 0; + int is_l1_shadow = + ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) == + PGT_l1_shadow); - gpfn = __mfn_to_gpfn(d, gmfn); + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( unlikely(((pl1e[i] ^ match) & mask) == 0) ) + { + unsigned long ol2e = pl1e[i]; + pl1e[i] = 0; + count++; + + if ( is_l1_shadow ) + put_page_from_l1e(mk_l1_pgentry(ol2e), d); + else /* must be an hl2 page */ + put_page(&frame_table[forbidden_gmfn]); + } + } - SH_VVLOG("shadow_l2_table( %p )", gmfn); + unmap_domain_mem(pl1e); - perfc_incrc(shadow_l2_table_count); + return count; +} - if ( (spfn_info = alloc_shadow_page(d)) == NULL ) - BUG(); /* XXX Deal gracefully with failure. */ +u32 shadow_remove_all_access(struct domain *d, unsigned long gmfn) +{ + int i; + struct shadow_status *a; + u32 count = 0; - spfn_info->u.inuse.type_info = PGT_l2_page_table; - perfc_incr(shadow_l2_pages); + ASSERT(spin_is_locked(&d->arch.shadow_lock)); - spfn = page_to_pfn(spfn_info); - /* Mark pfn as being shadowed; update field to point at shadow. */ - set_shadow_status(d, gpfn, spfn | PSH_shadowed); - -#ifdef __i386__ - /* Install hypervisor and 2x linear p.t. mapings. 
*/ - if ( !shadow_mode_translate(d) ) + for (i = 0; i < shadow_ht_buckets; i++) { - l2_pgentry_t *spl2e; - spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT); - /* - * We could proactively fill in PDEs for pages that are already - * shadowed *and* where the guest PDE has _PAGE_ACCESSED set - * (restriction required for coherence of the accessed bit). However, - * we tried it and it didn't help performance. This is simpler. - */ - memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t)); - - /* Install hypervisor and 2x linear p.t. mapings. */ - memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], - HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); - spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry((gmfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] = - mk_l2_pgentry(__pa(page_get_owner( - &frame_table[gmfn])->arch.mm_perdomain_pt) | - __PAGE_HYPERVISOR); - - unmap_domain_mem(spl2e); + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + if ( ((a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow) || + ((a->gpfn_and_flags & PGT_type_mask) == PGT_hl2_shadow) ) + { + count += remove_all_access_in_page(d, a->smfn, gmfn); + } + a = a->next; + } } -#endif - SH_VLOG("shadow_l2_table( %p -> %p)", gmfn, spfn); - return spfn; -} - -static void shadow_map_l1_into_current_l2(unsigned long va) -{ - struct exec_domain *ed = current; - struct domain *d = ed->domain; - unsigned long *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, sl1mfn, sl1ss; - struct pfn_info *sl1mfn_info; - int i; + return count; +} - __guest_get_l2e(ed, va, &gl2e); +static int resync_all(struct domain *d, u32 stype) +{ + struct out_of_sync_entry *entry; + unsigned i; + unsigned long smfn; + unsigned long *guest, *shadow, *snapshot; + int need_flush = 0, 
external = shadow_mode_external(d); - gl1pfn = gl2e >> PAGE_SHIFT; + ASSERT(spin_is_locked(&d->arch.shadow_lock)); - sl1ss = __shadow_status(d, gl1pfn); - if ( !(sl1ss & PSH_shadowed) ) + for ( entry = d->arch.out_of_sync; entry; entry = entry->next) { - /* This L1 is NOT already shadowed so we need to shadow it. */ - SH_VVLOG("4a: l1 not shadowed ( %p )", sl1ss); + if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) + continue; - sl1mfn_info = alloc_shadow_page(d); - sl1mfn_info->u.inuse.type_info = PGT_l1_page_table; - - sl1mfn = sl1mfn_info - frame_table; + if ( !(smfn = __shadow_status(d, entry->gpfn, stype)) ) + continue; - perfc_incrc(shadow_l1_table_count); - perfc_incr(shadow_l1_pages); + FSH_LOG("resyncing t=%p gpfn=%p gmfn=%p smfn=%p snapshot_mfn=%p", + stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn); - set_shadow_status(d, gl1pfn, PSH_shadowed | sl1mfn); + // Compare guest's new contents to its snapshot, validating + // and updating its shadow as appropriate. + // + guest = map_domain_mem(entry->gmfn << PAGE_SHIFT); + snapshot = map_domain_mem(entry->snapshot_mfn << PAGE_SHIFT); + shadow = map_domain_mem(smfn << PAGE_SHIFT); - l2pde_general(d, &gl2e, &sl2e, sl1mfn); + switch ( stype ) { + case PGT_l1_shadow: + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + unsigned new_pte = guest[i]; + if ( new_pte != snapshot[i] ) + { + need_flush |= validate_pte_change(d, new_pte, &shadow[i]); - __guest_set_l2e(ed, va, gl2e); - __shadow_set_l2e(ed, va, sl2e); + // can't update snapshots of linear page tables -- they + // are used multiple times... 
+ // + // snapshot[i] = new_pte; + } + } + break; + case PGT_l2_shadow: + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + { + if ( !is_guest_l2_slot(i) && !external ) + continue; - gpl1e = (unsigned long *) &(linear_pg_table[ - (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]); + unsigned new_pde = guest[i]; + if ( new_pde != snapshot[i] ) + { + need_flush |= validate_pde_change(d, new_pde, &shadow[i]); - spl1e = (unsigned long *) &(shadow_linear_pg_table[ - (va>>L1_PAGETABLE_SHIFT) & ~(L1_PAGETABLE_ENTRIES-1)]); + // can't update snapshots of linear page tables -- they + // are used multiple times... + // + // snapshot[i] = new_pde; + } + } + break; + default: + BUG(); + break; + } - for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - l1pte_propagate_from_guest(d, &gpl1e[i], &spl1e[i]); + unmap_domain_mem(shadow); + unmap_domain_mem(snapshot); + unmap_domain_mem(guest); } - else - { - /* This L1 is shadowed already, but the L2 entry is missing. */ - SH_VVLOG("4b: was shadowed, l2 missing ( %p )", sl1ss); - sl1mfn = sl1ss & PSH_pfn_mask; - l2pde_general(d, &gl2e, &sl2e, sl1mfn); - __guest_set_l2e(ed, va, gl2e); - __shadow_set_l2e(ed, va, sl2e); - } + return need_flush; } -void shadow_invlpg(struct exec_domain *ed, unsigned long va) +void __shadow_sync_all(struct domain *d) { - unsigned long gpte, spte; + struct out_of_sync_entry *entry; + int need_flush = 0; - ASSERT(shadow_mode_enabled(ed->domain)); + perfc_incrc(shadow_sync_all); - /* - * XXX KAF: Why is this set-to-zero required? - * Why, on failure, must we bin all our shadow state? 
- */ - if (__put_user(0L, (unsigned long *) - &shadow_linear_pg_table[va >> PAGE_SHIFT])) { - vmx_shadow_clear_state(ed->domain); - return; - } + ASSERT(spin_is_locked(&d->arch.shadow_lock)); - if (__get_user(gpte, (unsigned long *) - &linear_pg_table[va >> PAGE_SHIFT])) { - return; - } + // First, remove all write permissions to the page tables + // + for ( entry = d->arch.out_of_sync; entry; entry = entry->next) + { + // Skip entries that have low bits set... Those aren't + // real PTEs. + // + if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) ) + continue; - l1pte_propagate_from_guest(ed->domain, &gpte, &spte); + unsigned long *ppte = map_domain_mem(entry->writable_pl1e); + unsigned long opte = *ppte; + unsigned long npte = opte & ~_PAGE_RW; - if (__put_user(spte, (unsigned long *) - &shadow_linear_pg_table[va >> PAGE_SHIFT])) { - return; + get_page_from_l1e(mk_l1_pgentry(npte), d); + *ppte = npte; + put_page_from_l1e(mk_l1_pgentry(opte), d); + + unmap_domain_mem(ppte); } + + // XXX mafetter: SMP perf bug. + // + // With the current algorithm, we've gotta flush all the TLBs + // before we can safely continue. I don't think we want to + // do it this way, so I think we should consider making + // entirely private copies of the shadow for each vcpu, and/or + // possibly having a mix of private and shared shadow state + // (any path from a PTE that grants write access to an out-of-sync + // page table page needs to be vcpu private). + // + flush_tlb_all(); + + // Second, resync all L1 pages, then L2 pages, etc... 
+ // + need_flush |= resync_all(d, PGT_l1_shadow); + if ( shadow_mode_translate(d) ) + need_flush |= resync_all(d, PGT_hl2_shadow); + need_flush |= resync_all(d, PGT_l2_shadow); + + if ( need_flush ) + local_flush_tlb(); + + free_out_of_sync_state(d); } int shadow_fault(unsigned long va, struct xen_regs *regs) { - unsigned long gpte, spte = 0; + unsigned long gpte, spte = 0, orig_gpte; struct exec_domain *ed = current; struct domain *d = ed->domain; + unsigned long gpde; SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code ); - - check_pagetable(d, ed->arch.guest_table, "pre-sf"); + perfc_incrc(shadow_fault_calls); + + check_pagetable(ed, "pre-sf"); /* - * STEP 1. A fast-reject set of checks with no locking. + * Don't let someone else take the guest's table pages out-of-sync. */ + shadow_lock(d); - if ( unlikely(__get_user(gpte, (unsigned long *) - &linear_pg_table[va >> PAGE_SHIFT])) ) - { - SH_VVLOG("shadow_fault - EXIT: read gpte faulted" ); - return 0; - } - - if ( !(gpte & _PAGE_PRESENT) ) - { - SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); - return 0; - } - - if ( (regs->error_code & 2) && !(gpte & _PAGE_RW) ) - { - /* Write fault on a read-only mapping. */ - return 0; - } + /* XXX - FIX THIS COMMENT!!! + * STEP 1. Check to see if this fault might have been caused by an + * out-of-sync table page entry, or if we should pass this + * fault onto the guest. + */ + __shadow_sync_va(ed, va); /* - * STEP 2. Take the shadow lock and re-check the guest PTE. + * STEP 2. Check the guest PTE. 
*/ - - shadow_lock(d); - - if ( unlikely(__get_user(gpte, (unsigned long *) - &linear_pg_table[va >> PAGE_SHIFT])) ) + __guest_get_l2e(ed, va, &gpde); + if ( unlikely(!(gpde & _PAGE_PRESENT)) ) { - SH_VVLOG("shadow_fault - EXIT: read gpte faulted2" ); + SH_VVLOG("shadow_fault - EXIT: L1 not present" ); + perfc_incrc(shadow_fault_bail_pde_not_present); shadow_unlock(d); return 0; } + // This can't fault because we hold the shadow lock and we've ensured that + // the mapping is in-sync, so the check of the PDE's present bit, above, + // covers this access. + // + orig_gpte = gpte = l1_pgentry_val(linear_pg_table[l1_linear_offset(va)]); if ( unlikely(!(gpte & _PAGE_PRESENT)) ) { - SH_VVLOG("shadow_fault - EXIT: gpte not present2 (%lx)",gpte ); + SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); + perfc_incrc(shadow_fault_bail_pte_not_present); shadow_unlock(d); return 0; } @@ -672,11 +1812,12 @@ int shadow_fault(unsigned long va, struct xen_regs *regs) { /* Write fault on a read-only mapping. */ SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte); + perfc_incrc(shadow_fault_bail_ro_mapping); shadow_unlock(d); return 0; } - l1pte_write_fault(d, &gpte, &spte); + l1pte_write_fault(ed, &gpte, &spte, va); } else { @@ -689,120 +1830,141 @@ int shadow_fault(unsigned long va, struct xen_regs *regs) /* XXX Watch out for read-only L2 entries! (not used in Linux). */ if ( unlikely(__put_user(gpte, (unsigned long *) - &linear_pg_table[va >> PAGE_SHIFT])) ) - domain_crash(); - - /* - * Update of shadow PTE can fail because the L1 p.t. is not shadowed, - * or because the shadow isn't linked into this shadow L2 p.t. 
- */ - if ( unlikely(__put_user(spte, (unsigned long *) - &shadow_linear_pg_table[va >> PAGE_SHIFT])) ) + &linear_pg_table[l1_linear_offset(va)])) ) { - SH_VVLOG("3: not shadowed/mapped gpte=%p spte=%p", gpte, spte); - shadow_map_l1_into_current_l2(va); - shadow_linear_pg_table[va >> PAGE_SHIFT] = mk_l1_pgentry(spte); + printk("shadow_fault(): crashing domain %d " + "due to a read-only L2 page table (gpde=%p), va=%p\n", + d->id, gpde, va); + domain_crash(); } - perfc_incrc(shadow_fixup_count); + // if necessary, record the page table page as dirty + if ( unlikely(shadow_mode_log_dirty(d)) && (orig_gpte != gpte) ) + mark_dirty(d, __gpfn_to_mfn(d, gpde >> PAGE_SHIFT)); + + shadow_set_l1e(va, spte, 1); + + perfc_incrc(shadow_fault_fixed); d->arch.shadow_fault_count++; shadow_unlock(d); - check_pagetable(d, ed->arch.guest_table, "post-sf"); + check_pagetable(ed, "post-sf"); return EXCRET_fault_fixed; } - -void shadow_l1_normal_pt_update( - unsigned long pa, unsigned long gpte, - unsigned long *prev_smfn_ptr, - l1_pgentry_t **prev_spl1e_ptr) +/* + * What lives where in the 32-bit address space in the various shadow modes, + * and what it uses to get/maintain that mapping. + * + * SHADOW MODE: none enable translate external + * + * 4KB things: + * guest_vtable lin_l2 mapped per gpdt lin_l2 via hl2 mapped per gpdt + * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gpdt + * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gpdt + * monitor_vtable n/a n/a n/a mapped once + * + * 4MB things: + * guest_linear lin via gpdt lin via gpdt lin via hl2 lin via hl2 + * shadow_linear n/a sh_lin via spdt sh_lin via spdt sh_lin via spdt + * monitor_linear n/a n/a n/a ??? 
+ * perdomain perdomain perdomain perdomain perdomain + * R/O M2P R/O M2P R/O M2P n/a n/a + * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P + * P2M n/a n/a R/O M2P R/O M2P + * + * NB: + * update_pagetables(), __update_pagetables(), shadow_mode_enable(), + * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable() + * all play a part in maintaining these mappings. + */ +void __update_pagetables(struct exec_domain *ed) { - unsigned long smfn, spte, prev_smfn = *prev_smfn_ptr; - l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr; + struct domain *d = ed->domain; + unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; + unsigned long gpfn = __mfn_to_gpfn(d, gmfn); + unsigned long smfn, hl2mfn; - /* N.B. To get here, we know the l1 page *must* be shadowed. */ - SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%p, " - "prev_smfn=%p, prev_spl1e=%p", - pa, gpte, prev_smfn, prev_spl1e); + int max_mode = ( shadow_mode_external(d) ? SHM_external + : shadow_mode_translate(d) ? SHM_translate + : shadow_mode_enabled(d) ? SHM_enable + : 0 ); - smfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask; + ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); + ASSERT( max_mode ); - if ( smfn == prev_smfn ) - { - spl1e = prev_spl1e; - } - else + /* + * arch.guest_vtable + */ + if ( max_mode & (SHM_enable | SHM_external) ) { - if ( prev_spl1e != NULL ) - unmap_domain_mem( prev_spl1e ); - spl1e = (l1_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT); - *prev_smfn_ptr = smfn; - *prev_spl1e_ptr = spl1e; + if ( likely(ed->arch.guest_vtable != NULL) ) + unmap_domain_mem(ed->arch.guest_vtable); + ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT); } - l1pte_propagate_from_guest(current->domain, &gpte, &spte); - spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte); -} - -void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde) -{ - unsigned long sl2mfn, spde = 0; - l2_pgentry_t *spl2e; - unsigned long sl1mfn; - - /* N.B. 
To get here, we know the l2 page *must* be shadowed. */ - SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%p",pa,gpde); + /* + * arch.shadow_table + */ + if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) ) + smfn = shadow_l2_table(d, gpfn, gmfn); + get_shadow_ref(smfn); + if ( pagetable_val(ed->arch.shadow_table) ) + put_shadow_ref(pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT); + ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT); - sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask; + SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn); /* - * Only propagate to shadow if _PAGE_ACCESSED is set in the guest. - * Otherwise, to ensure coherency, we blow away the existing shadow value. + * arch.shadow_vtable */ - if ( gpde & _PAGE_ACCESSED ) + if ( max_mode == SHM_external ) { - sl1mfn = (gpde & _PAGE_PRESENT) ? - __shadow_status(current->domain, gpde >> PAGE_SHIFT) : 0; - l2pde_general(current->domain, &gpde, &spde, sl1mfn); + if ( ed->arch.shadow_vtable ) + unmap_domain_mem(ed->arch.shadow_vtable); + ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT); } - spl2e = (l2_pgentry_t *)map_domain_mem(sl2mfn << PAGE_SHIFT); - spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spde); - unmap_domain_mem(spl2e); -} + /* + * arch.hl2_vtable + */ -unsigned long mk_hl2_table(struct exec_domain *ed) -{ - struct domain *d = ed->domain; - unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; - unsigned long gpfn = __mfn_to_gpfn(d, gmfn); - unsigned long hl2mfn, status; - struct pfn_info *hl2_info; - l1_pgentry_t *hl2; + // if max_mode == SHM_translate, then the hl2 is already installed + // correctly in its smfn, and there's nothing to do. 
+ // + if ( max_mode == SHM_external ) + { + if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) + hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); + get_shadow_ref(hl2mfn); - perfc_incr(hl2_table_pages); + if ( ed->arch.hl2_vtable ) + unmap_domain_mem(ed->arch.hl2_vtable); + ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT); + } - if ( (hl2_info = alloc_shadow_page(d)) == NULL ) - BUG(); /* XXX Deal gracefully with failure. */ + /* + * fixup pointers in monitor table, as necessary + */ + if ( max_mode == SHM_external ) + { + l2_pgentry_t *mpl2e = ed->arch.monitor_vtable; - hl2_info->u.inuse.type_info = PGT_l1_page_table; + ASSERT( shadow_mode_translate(d) ); - hl2mfn = page_to_pfn(hl2_info); - status = hl2mfn | PSH_hl2; - set_shadow_status(ed->domain, gpfn | PSH_hl2, status); + mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = + mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - // need to optimize this... - hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT); - memset(hl2, 0, PAGE_SIZE); - unmap_domain_mem(hl2); + mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - return status; + // XXX - maybe this can be optimized somewhat?? 
+ local_flush_tlb(); + } } - /************************************************************************/ /************************************************************************/ /************************************************************************/ @@ -838,12 +2000,13 @@ int shadow_status_noswap; static int check_pte( struct domain *d, unsigned long *pgpte, unsigned long *pspte, - int level, int l2_idx, int l1_idx) + int level, int l2_idx, int l1_idx, int oos_ptes) { unsigned gpte = *pgpte; unsigned spte = *pspte; - unsigned long mask, gpfn, smfn; + unsigned long mask, gpfn, smfn, gmfn; int errors = 0; + int page_table_page; if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) ) return errors; /* always safe */ @@ -862,21 +2025,36 @@ static int check_pte( if ( (spte & mask) != (gpte & mask) ) FAIL("Corrupt?"); - if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) ) + if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) && !oos_ptes ) FAIL("Dirty coherence"); - if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) ) + if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) && !oos_ptes ) FAIL("Accessed coherence"); - if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) ) + smfn = spte >> PAGE_SHIFT; + gpfn = gpte >> PAGE_SHIFT; + gmfn = __gpfn_to_mfn(d, gpfn); + + page_table_page = mfn_is_page_table(gmfn); + + if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) && !oos_ptes ) + { + printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n", + gpfn, gmfn, smfn, + frame_table[gmfn].u.inuse.type_info, + page_table_page, oos_ptes); FAIL("RW coherence"); + } - if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) ) + if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) && !oos_ptes ) + { + printk("gpfn=%p gmfn=%p smfn=%p t=0x%08x page_table_page=%d oos_ptes=%d\n", + gpfn, gmfn, smfn, + frame_table[gmfn].u.inuse.type_info, + page_table_page, oos_ptes); FAIL("RW2 coherence"); + } - smfn = spte >> PAGE_SHIFT; - 
gpfn = gpte >> PAGE_SHIFT; - if ( gpfn == smfn ) { if ( level > 1 ) @@ -887,23 +2065,26 @@ static int check_pte( if ( level < 2 ) FAIL("Shadow in L1 entry?"); - if ( __shadow_status(d, gpfn) != (PSH_shadowed | smfn) ) - FAIL("smfn problem g.sf=%p", - __shadow_status(d, gpfn) ); + if ( level == 2 ) + { + if ( __shadow_status(d, gpfn, PGT_l1_shadow) != smfn ) + FAIL("smfn problem gpfn=%p smfn=%p", gpfn, + __shadow_status(d, gpfn, PGT_l1_shadow)); + } + else + BUG(); // XXX -- not handled yet. } return errors; } - static int check_l1_table( - struct domain *d, + struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned long smfn, unsigned l2_idx) { int i; unsigned long *gpl1e, *spl1e; - int cpu = current->processor; - int errors = 0; + int errors = 0, oos_ptes = 0; // First check to see if this guest page is currently the active // PTWR page. If so, then we compare the (old) cached copy of the @@ -912,6 +2093,8 @@ static int check_l1_table( // if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ) { + int cpu = current->processor; + for ( i = 0; i < ARRAY_SIZE(ptwr_info->ptinfo); i++) { if ( ptwr_info[cpu].ptinfo[i].l1va && @@ -925,11 +2108,18 @@ static int check_l1_table( } } + if ( page_out_of_sync(pfn_to_page(gmfn)) ) + { + gmfn = __shadow_status(d, gpfn, PGT_snapshot); + oos_ptes = 1; + ASSERT(gmfn); + } + gpl1e = map_domain_mem(gmfn << PAGE_SHIFT); spl1e = map_domain_mem(smfn << PAGE_SHIFT); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) - errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i); + errors += check_pte(d, &gpl1e[i], &spl1e[i], 1, l2_idx, i, oos_ptes); unmap_domain_mem(spl1e); unmap_domain_mem(gpl1e); @@ -944,20 +2134,23 @@ static int check_l1_table( } while ( 0 ) int check_l2_table( - struct domain *d, unsigned long gpfn, unsigned long smfn) + struct domain *d, unsigned long gmfn, unsigned long smfn, int oos_pdes) { - unsigned long gmfn = __gpfn_to_mfn(d, gpfn); - l2_pgentry_t *gpl2e = (l2_pgentry_t *) map_domain_mem( gmfn << PAGE_SHIFT ); 
- l2_pgentry_t *spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT ); + l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_mem(gmfn << PAGE_SHIFT); + l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT); int i; int errors = 0; + int limit; - if ( page_get_owner(pfn_to_page(gmfn)) != d ) + if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) ) FAILPT("domain doesn't own page"); + if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) ) + FAILPT("bogus owner for snapshot page"); if ( page_get_owner(pfn_to_page(smfn)) != NULL ) FAILPT("shadow page mfn=0x%08x is owned by someone, domid=%d", smfn, page_get_owner(pfn_to_page(smfn))->id); +#if 0 if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - @@ -974,40 +2167,62 @@ int check_l2_table( if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) ) FAILPT("hypervisor linear map inconsistent"); +#endif - if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> + if ( !shadow_mode_external(d) && + (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != ((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) ) + { FAILPT("hypervisor shadow linear map inconsistent %p %p", l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]), (smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + } - if ( !shadow_mode_translate(d) ) { - if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) != - ((v2m(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt) | - __PAGE_HYPERVISOR))) ) - FAILPT("hypervisor per-domain map inconsistent"); + if ( !shadow_mode_external(d) && + (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) != + ((__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR))) ) + { + FAILPT("hypervisor per-domain map inconsistent saw %p, expected (va=%p) %p", + 
l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]), + d->arch.mm_perdomain_pt, + (__pa(d->arch.mm_perdomain_pt) | __PAGE_HYPERVISOR)); } + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + /* Check the whole L2. */ - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) - errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0); + for ( i = 0; i < limit; i++ ) + errors += check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i, 0, 0); unmap_domain_mem(spl2e); unmap_domain_mem(gpl2e); +#if 1 + if ( errors ) + printk("check_l2_table returning %d errors\n", errors); +#endif + return errors; } -int _check_pagetable(struct domain *d, pagetable_t pt, char *s) +int _check_pagetable(struct exec_domain *ed, char *s) { + struct domain *d = ed->domain; + pagetable_t pt = ed->arch.guest_table; unsigned long gptbase = pagetable_val(pt); - unsigned long ptbase_pfn, smfn, ss; + unsigned long ptbase_pfn, smfn; unsigned long i; l2_pgentry_t *gpl2e, *spl2e; unsigned long ptbase_mfn = 0; - int errors = 0; + int errors = 0, limit, oos_pdes = 0; + + audit_domain(d); + shadow_lock(d); sh_check_name = s; SH_VVLOG("%s-PT Audit", s); @@ -1017,30 +2232,31 @@ int _check_pagetable(struct domain *d, pagetable_t pt, char *s) ptbase_pfn = gptbase >> PAGE_SHIFT; ptbase_mfn = __gpfn_to_mfn(d, ptbase_pfn); - ss = __shadow_status(d, ptbase_pfn); - - if ( ! 
(ss & PSH_shadowed) ) + if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) ) { printk("%s-PT %p not shadowed\n", s, gptbase); errors++; - - if ( ss != 0 ) - BUG(); - return errors; - } + goto out; + } + if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) ) + { + ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot); + oos_pdes = 1; + ASSERT(ptbase_mfn); + } - smfn = ss & PSH_pfn_mask; - - if ( ss != (PSH_shadowed | smfn) ) - FAILPT("ptbase shadow inconsistent1"); - - errors += check_l2_table(d, ptbase_pfn, smfn); + errors += check_l2_table(d, ptbase_mfn, smfn, oos_pdes); gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT ); spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT ); /* Go back and recurse. */ - for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + + for ( i = 0; i < limit; i++ ) { unsigned long gl1pfn = l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT; unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn); @@ -1048,7 +2264,7 @@ int _check_pagetable(struct domain *d, pagetable_t pt, char *s) if ( l2_pgentry_val(spl2e[i]) != 0 ) { - errors += check_l1_table(d, gl1mfn, sl1mfn, i); + errors += check_l1_table(d, gl1pfn, gl1mfn, sl1mfn, i); } } @@ -1057,22 +2273,23 @@ int _check_pagetable(struct domain *d, pagetable_t pt, char *s) SH_VVLOG("PT verified : l2_present = %d, l1_present = %d", sh_l2_present, sh_l1_present); - -#if 1 + + out: if ( errors ) BUG(); -#endif + + shadow_unlock(d); return errors; } -int _check_all_pagetables(struct domain *d, char *s) +int _check_all_pagetables(struct exec_domain *ed, char *s) { - int i, j; + struct domain *d = ed->domain; + int i; struct shadow_status *a; unsigned long gmfn; int errors = 0; - int cpu; shadow_status_noswap = 1; @@ -1084,22 +2301,34 @@ int _check_all_pagetables(struct domain *d, char *s) for (i = 0; i < shadow_ht_buckets; i++) { a = &d->arch.shadow_ht[i]; - while ( a 
&& a->pfn ) + while ( a && a->gpfn_and_flags ) { - gmfn = __gpfn_to_mfn(d, a->pfn); - switch ( frame_table[a->pfn].u.inuse.type_info & PGT_type_mask ) + gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask); + + switch ( a->gpfn_and_flags & PGT_type_mask ) { - case PGT_l1_page_table: - errors += check_l1_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask, 0); + case PGT_l1_shadow: + errors += check_l1_table(d, a->gpfn_and_flags & PGT_mfn_mask, + gmfn, a->smfn, 0); + break; + case PGT_l2_shadow: + errors += check_l2_table(d, gmfn, a->smfn, + page_out_of_sync(pfn_to_page(gmfn))); break; - case PGT_l2_page_table: - errors += check_l2_table(d, gmfn, a->smfn_and_flags & PSH_pfn_mask); + case PGT_l3_shadow: + case PGT_l4_shadow: + case PGT_hl2_shadow: + BUG(); // XXX - ought to fix this... + break; + case PGT_snapshot: break; default: errors++; - printk("unexpected page type 0x%08x, pfn=0x%08x, gmfn=0x%08x\n", - frame_table[gmfn].u.inuse.type_info, - a->pfn, gmfn); + printk("unexpected shadow type %p, gpfn=%p, " + "gmfn=%p smfn=%p\n", + a->gpfn_and_flags & PGT_type_mask, + a->gpfn_and_flags & PGT_mfn_mask, + gmfn, a->smfn); BUG(); } a = a->next; @@ -1108,52 +2337,8 @@ int _check_all_pagetables(struct domain *d, char *s) shadow_status_noswap = 0; - for (i = 0; i < 1024; i++) - { - if ( l2_pgentry_val(shadow_linear_l2_table[i]) & _PAGE_PRESENT ) - { - unsigned base = i << 10; - for (j = 0; j < 1024; j++) - { - if ( (l1_pgentry_val(shadow_linear_pg_table[base + j]) & PAGE_MASK) == 0x0143d000 ) - { - printk("sh_ln_pg_tb[0x%08x] => 0x%08lx ", - base + j, - l1_pgentry_val(shadow_linear_pg_table[base + j])); - if ( l1_pgentry_val(shadow_linear_pg_table[base + j]) & _PAGE_PRESENT ) - printk(" first entry => 0x%08lx\n", - *(unsigned long *)((base + j) << PAGE_SHIFT)); - else - printk(" page not present\n"); - } - } - } - } - if ( errors ) - { - printk("VM_ASSIST(d, VMASST_TYPE_writable_pagetables) => %d\n", - VM_ASSIST(d, VMASST_TYPE_writable_pagetables)); - for ( cpu = 0; cpu < 
smp_num_cpus; cpu++ ) - { - for ( j = 0; j < ARRAY_SIZE(ptwr_info->ptinfo); j++) - { - printk("ptwr_info[%d].ptinfo[%d].l1va => 0x%08x\n", - cpu, j, ptwr_info[cpu].ptinfo[j].l1va); - printk("ptwr_info[%d].ptinfo[%d].pl1e => 0x%08x\n", - cpu, j, ptwr_info[cpu].ptinfo[j].pl1e); - if (cpu == smp_processor_id()) - printk("v2m(ptwr_info[%d].ptinfo[%d].pl1e) => 0x%08x\n", - cpu, j, v2m(ptwr_info[cpu].ptinfo[j].pl1e)); - printk("ptwr_info[%d].ptinfo[%d].page => 0x%08x\n", - cpu, j, ptwr_info[cpu].ptinfo[j].page); - if (cpu == smp_processor_id()) - printk("v2m(ptwr_info[%d].ptinfo[%d].page) => 0x%08x\n", - cpu, j, v2m(ptwr_info[cpu].ptinfo[j].page)); - } - } BUG(); - } return errors; } diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 422c737c43..8516e0b59b 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -114,7 +114,7 @@ asmlinkage void fatal_trap(int trapnr, struct xen_regs *regs) if ( trapnr == TRAP_page_fault ) { __asm__ __volatile__ ("mov %%cr2,%0" : "=r" (cr2) : ); - printk("Faulting linear address might be %0lx %lx\n", cr2, cr2); + printk("Faulting linear address might be %p\n", cr2); } printk("************************************\n"); @@ -269,6 +269,8 @@ asmlinkage int do_page_fault(struct xen_regs *regs) DEBUGGER_trap_entry(TRAP_page_fault, regs); + //printk("do_page_fault(eip=%p, va=%p, code=%d)\n", regs->eip, addr, regs->error_code); + perfc_incrc(page_faults); if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) @@ -295,9 +297,12 @@ asmlinkage int do_page_fault(struct xen_regs *regs) UNLOCK_BIGLOCK(d); } - if ( unlikely(shadow_mode_enabled(d)) && - (addr < PAGE_OFFSET) && shadow_fault(addr, regs) ) + if ( unlikely(shadow_mode_enabled(d)) && + ((addr < PAGE_OFFSET) || shadow_mode_external(d)) && + shadow_fault(addr, regs) ) + { return EXCRET_fault_fixed; + } if ( unlikely(addr >= LDT_VIRT_START(ed)) && (addr < (LDT_VIRT_START(ed) + (ed->arch.ldt_ents*LDT_ENTRY_SIZE))) ) diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c 
index ada1403714..5ef572ef02 100644 --- a/xen/arch/x86/vmx.c +++ b/xen/arch/x86/vmx.c @@ -106,6 +106,7 @@ static void inline __update_guest_eip(unsigned long inst_len) static int vmx_do_page_fault(unsigned long va, struct xen_regs *regs) { + struct exec_domain *ed = current; unsigned long eip; unsigned long gpte, gpa; int result; @@ -123,9 +124,9 @@ static int vmx_do_page_fault(unsigned long va, struct xen_regs *regs) * If vpagetable is zero, then we are still emulating 1:1 page tables, * and we should have never gotten here. */ - if ( !current->arch.guest_vtable ) + if ( !test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state) ) { - printk("vmx_do_page_fault while still running on 1:1 page table\n"); + printk("vmx_do_page_fault while running on 1:1 page table\n"); return 0; } @@ -269,21 +270,17 @@ static void vmx_vmexit_do_invlpg(unsigned long va) { unsigned long eip; struct exec_domain *ed = current; - unsigned int index; __vmread(GUEST_EIP, &eip); - VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg:eip=%p, va=%p", - eip, va); + VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg: eip=%p, va=%p", + eip, va); /* * We do the safest things first, then try to update the shadow * copying from guest */ shadow_invlpg(ed, va); - index = l2_table_offset(va); - ed->arch.hl2_vtable[index] = - mk_l2_pgentry(0); /* invalidate pgd cache */ } static void vmx_io_instruction(struct xen_regs *regs, @@ -428,14 +425,6 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs) } old_base_mfn = pagetable_val(d->arch.guest_table) >> PAGE_SHIFT; - /* We know that none of the previous 1:1 shadow pages are - * going to be used again, so might as well flush them. - * XXXX wait until the last VCPU boots before doing the flush !! - */ - shadow_lock(d->domain); - free_shadow_state(d->domain); // XXX SMP - shadow_unlock(d->domain); - /* * Now arch.guest_table points to machine physical. 
*/ @@ -469,7 +458,6 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs) break; } - hl2_table_invalidate(d); /* * We make a new one if the shadow does not exist. */ @@ -482,8 +470,7 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs) mfn = phys_to_machine_mapping(value >> PAGE_SHIFT); if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table)) __vmx_bug(regs); - vmx_shadow_clear_state(d->domain); - shadow_invalidate(d); + shadow_sync_all(d->domain); } else { /* * If different, make a shadow. Check if the PDBR is valid @@ -525,8 +512,6 @@ static void mov_to_cr(int gp, int cr, struct xen_regs *regs) */ if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) { vmx_shadow_clear_state(d->domain); - shadow_invalidate(d); - hl2_table_invalidate(d); } break; default: diff --git a/xen/arch/x86/x86_32/domain_build.c b/xen/arch/x86/x86_32/domain_build.c index d7cdc892c1..a0b2b94808 100644 --- a/xen/arch/x86/x86_32/domain_build.c +++ b/xen/arch/x86/x86_32/domain_build.c @@ -49,6 +49,8 @@ int construct_dom0(struct domain *d, char *image_start = (char *)_image_start; /* use lowmem mappings */ char *initrd_start = (char *)_initrd_start; /* use lowmem mappings */ + int shadow_dom0 = 1; // HACK ALERT !! Force dom0 to run in shadow mode. + /* * This fully describes the memory layout of the initial domain. 
All * *_start address are page-aligned, except v_start (and v_end) which are @@ -260,8 +262,14 @@ int construct_dom0(struct domain *d, l1tab += l1_table_offset(vpt_start); for ( count = 0; count < nr_pt_pages; count++ ) { - *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); page = &frame_table[l1_pgentry_to_pfn(*l1tab)]; + + if ( !shadow_dom0 ) + *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW); + else + if ( !get_page_type(page, PGT_writable_page) ) + BUG(); + if ( count == 0 ) { page->u.inuse.type_info &= ~PGT_type_mask; @@ -380,13 +388,11 @@ int construct_dom0(struct domain *d, new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start); -#ifndef NDEBUG - if (0) /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */ + if ( shadow_dom0 ) { shadow_mode_enable(d, SHM_enable); update_pagetables(ed); /* XXX SMP */ } -#endif return 0; } diff --git a/xen/arch/x86/x86_32/domain_page.c b/xen/arch/x86/x86_32/domain_page.c index e3aa720c78..8f838205b3 100644 --- a/xen/arch/x86/x86_32/domain_page.c +++ b/xen/arch/x86/x86_32/domain_page.c @@ -91,6 +91,8 @@ void *map_domain_mem(unsigned long pa) void unmap_domain_mem(void *va) { unsigned int idx; + ASSERT((void *)MAPCACHE_VIRT_START <= va); + ASSERT(va < (void *)MAPCACHE_VIRT_END); idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT; mapcache[idx] |= READY_FOR_TLB_FLUSH; } diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c index 3e7608b598..b4553f5d1c 100644 --- a/xen/common/dom_mem_ops.c +++ b/xen/common/dom_mem_ops.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * To allow safe resume of do_dom_mem_op() after preemption, we need to know @@ -111,6 +112,27 @@ free_dom_mem(struct domain *d, if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); + if ( unlikely(shadow_mode_enabled(d)) ) + { + // XXX This needs more thought. This isn't pretty, + // and it's not fast. But it's a place holder. 
+ // + shadow_lock(d); + if ( page_out_of_sync(page) ) + __shadow_sync_mfn(d, mpfn + j); + shadow_remove_all_access(d, mpfn + j); + + if (page->count_info != 1) + { + printk("free_dom_mem in shadow mode didn't release page " + "mfn=%p c=%p\n", mpfn+j, page->count_info); + shadow_unlock(d); + audit_domain(d); + BUG(); + } + shadow_unlock(d); + } + put_page(page); } } diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index b1ed11162d..792913b779 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -188,7 +188,7 @@ void initialize_keytable(void) register_keyhandler( 'o', audit_domains_key, "audit domains >0 EXPERIMENTAL"); register_keyhandler( - 'T', debugtrace_key, "dump debugtrace"); + 'T', debugtrace_key, "toggle debugtrace to console/buffer"); #endif #ifdef PERF_COUNTERS diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 7a727f1251..44256b441a 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Comma-separated list of hexadecimal page numbers containing bad bytes. @@ -566,7 +567,23 @@ void free_domheap_pages(struct pfn_info *pg, unsigned int order) for ( i = 0; i < (1 << order); i++ ) { - ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0); + if ( ((pg[i].u.inuse.type_info & PGT_count_mask) != 0) && + shadow_mode_enabled(d) ) + { + // XXX This needs more thought... 
+ // + printk("%s: needing to call shadow_remove_all_access for mfn=%p\n", + __func__, page_to_pfn(&pg[i])); + printk("Amfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]), + pg[i].count_info, pg[i].u.inuse.type_info); + shadow_lock(d); + shadow_remove_all_access(d, page_to_pfn(&pg[i])); + shadow_unlock(d); + printk("Bmfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]), + pg[i].count_info, pg[i].u.inuse.type_info); + } + + ASSERT( (pg[i].u.inuse.type_info & PGT_count_mask) == 0 ); pg[i].tlbflush_timestamp = tlbflush_current_time(); pg[i].u.free.cpu_mask = cpu_mask; list_del(&pg[i].list); diff --git a/xen/common/schedule.c b/xen/common/schedule.c index f5d4727ff7..f28a352eed 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -423,6 +423,9 @@ void __enter_scheduler(void) perfc_incrc(sched_ctx); + // Q: With full shadow mode, do we need to flush out-of-sync pages + // before switching domains? Current belief is NO. + if ( !is_idle_task(prev->domain) ) { LOCK_BIGLOCK(prev->domain); diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index d8bb48043d..02276cbb55 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -35,11 +35,21 @@ struct arch_domain unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */ /* shadow mode stats */ - unsigned int shadow_page_count; - unsigned int shadow_fault_count; - unsigned int shadow_dirty_count; - unsigned int shadow_dirty_net_count; - unsigned int shadow_dirty_block_count; + unsigned int shadow_page_count; + unsigned int hl2_page_count; + unsigned int snapshot_page_count; + + unsigned int shadow_fault_count; + unsigned int shadow_dirty_count; + unsigned int shadow_dirty_net_count; + unsigned int shadow_dirty_block_count; + + /* full shadow mode */ + struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */ + struct out_of_sync_entry *out_of_sync_free; + struct out_of_sync_entry *out_of_sync_extras; + unsigned int out_of_sync_extras_count; + } __cacheline_aligned; struct 
arch_exec_domain @@ -109,8 +119,8 @@ struct arch_exec_domain l2_pgentry_t *guest_vtable; /* virtual address of pagetable */ l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */ - l2_pgentry_t *hl2_vtable; /* virtual address of hl2_table */ l2_pgentry_t *monitor_vtable; /* virtual address of monitor_table */ + l1_pgentry_t *hl2_vtable; /* virtual address of hl2_table */ /* Virtual CR2 value. Can be read/written by guest. */ unsigned long guest_cr2; diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 3e4b1d4b0b..7cb895e9fc 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -69,7 +69,16 @@ struct pfn_info #define PGT_gdt_page (5<<29) /* using this page in a GDT? */ #define PGT_ldt_page (6<<29) /* using this page in an LDT? */ #define PGT_writable_page (7<<29) /* has writable mappings of this page? */ + +#define PGT_l1_shadow PGT_l1_page_table +#define PGT_l2_shadow PGT_l2_page_table +#define PGT_l3_shadow PGT_l3_page_table +#define PGT_l4_shadow PGT_l4_page_table +#define PGT_hl2_shadow (5<<29) +#define PGT_snapshot (6<<29) + #define PGT_type_mask (7<<29) /* Bits 29-31. */ + /* Has this page been validated for use as its current type? */ #define _PGT_validated 28 #define PGT_validated (1U<<_PGT_validated) @@ -86,11 +95,19 @@ struct pfn_info /* 17-bit count of uses of this frame as its current type. */ #define PGT_count_mask ((1U<<17)-1) +#define PGT_mfn_mask ((1U<<21)-1) /* mfn mask for shadow types */ + /* Cleared when the owning guest 'frees' this page. */ #define _PGC_allocated 31 #define PGC_allocated (1U<<_PGC_allocated) - /* 31-bit count of references to this frame. 
*/ -#define PGC_count_mask ((1U<<31)-1) + /* Set when fullshadow mode marks a page out-of-sync */ +#define _PGC_out_of_sync 30 +#define PGC_out_of_sync (1U<<_PGC_out_of_sync) + /* Set when fullshadow mode is using a page as a page table */ +#define _PGC_page_table 29 +#define PGC_page_table (1U<<_PGC_page_table) + /* 29-bit count of references to this frame. */ +#define PGC_count_mask ((1U<<29)-1) /* We trust the slab allocator in slab.c, and our use of it. */ #define PageSlab(page) (1) @@ -112,6 +129,8 @@ static inline u32 pickle_domptr(struct domain *domain) #define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d)) +#define page_out_of_sync(_p) ((_p)->count_info & PGC_out_of_sync) + #define SHARE_PFN_WITH_DOMAIN(_pfn, _dom) \ do { \ page_set_owner((_pfn), (_dom)); \ @@ -135,6 +154,11 @@ void init_frametable(void); int alloc_page_type(struct pfn_info *page, unsigned int type); void free_page_type(struct pfn_info *page, unsigned int type); +extern void invalidate_shadow_ldt(struct exec_domain *d); +extern u32 shadow_remove_all_write_access( + struct domain *d, unsigned min_type, unsigned max_type, + unsigned long gpfn); +extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn); static inline void put_page(struct pfn_info *page) { @@ -166,8 +190,10 @@ static inline int get_page(struct pfn_info *page, unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ unlikely(d != _domain) ) /* Wrong owner? */ { - DPRINTK("Error pfn %p: ed=%p, sd=%p, caf=%08x, taf=%08x\n", - page_to_pfn(page), domain, unpickle_domptr(d), + DPRINTK("Error pfn %p: rd=%p(%d), od=%p(%d), caf=%08x, taf=%08x\n", + page_to_pfn(page), domain, (domain ? domain->id : -1), + page_get_owner(page), + (page_get_owner(page) ? 
page_get_owner(page)->id : -1), x, page->u.inuse.type_info); return 0; } @@ -184,6 +210,8 @@ static inline int get_page(struct pfn_info *page, void put_page_type(struct pfn_info *page); int get_page_type(struct pfn_info *page, u32 type); +int get_page_from_l1e(l1_pgentry_t l1e, struct domain *d); +void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d); static inline void put_page_and_type(struct pfn_info *page) { @@ -207,6 +235,22 @@ static inline int get_page_and_type(struct pfn_info *page, return rc; } +static inline int mfn_is_page_table(unsigned long mfn) +{ + if ( !pfn_is_ram(mfn) ) + return 0; + + return frame_table[mfn].count_info & PGC_page_table; +} + +static inline int page_is_page_table(struct pfn_info *page) +{ + if ( !pfn_is_ram(page_to_pfn(page)) ) + return 0; + + return page->count_info & PGC_page_table; +} + #define ASSERT_PAGE_IS_TYPE(_p, _t) \ ASSERT(((_p)->u.inuse.type_info & PGT_type_mask) == (_t)); \ ASSERT(((_p)->u.inuse.type_info & PGT_count_mask) != 0) @@ -307,6 +351,7 @@ void ptwr_flush(const int); int ptwr_do_page_fault(unsigned long); int new_guest_cr3(unsigned long pfn); +void propagate_page_fault(unsigned long addr, u16 error_code); #define __cleanup_writable_pagetable(_what) \ do { \ @@ -326,14 +371,24 @@ do { \ PTWR_CLEANUP_INACTIVE); \ } while ( 0 ) +int audit_adjust_pgtables(struct domain *d, int dir, int noisy); + #ifndef NDEBUG -void audit_domain(struct domain *d); + +#define AUDIT_ALREADY_LOCKED ( 1u << 0 ) +#define AUDIT_ERRORS_OK ( 1u << 1 ) +#define AUDIT_QUIET ( 1u << 2 ) + +void _audit_domain(struct domain *d, int flags, const char *file, int line); +#define audit_domain(_d) _audit_domain((_d), 0, __FILE__, __LINE__) void audit_domains(void); + #else + +#define _audit_domain(_d, _f, _file, _line) ((void)0) #define audit_domain(_d) ((void)0) #define audit_domains() ((void)0) -#endif -void propagate_page_fault(unsigned long addr, u16 error_code); +#endif #endif /* __ASM_X86_MM_H__ */ diff --git a/xen/include/asm-x86/page.h 
b/xen/include/asm-x86/page.h index 1dad979f2e..660ba4cfd8 100644 --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -57,9 +57,11 @@ typedef struct { unsigned long pt_lo; } pagetable_t; #include #define linear_pg_table ((l1_pgentry_t *)LINEAR_PT_VIRT_START) -#define linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START+(LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) +#define __linear_l2_table ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) +#define linear_l2_table(_ed) ((_ed)->arch.guest_vtable) -#define va_to_l1mfn(_va) (l2_pgentry_val(linear_l2_table[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT) +#define va_to_l1mfn(_ed, _va) (l2_pgentry_val(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]) >> PAGE_SHIFT) extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; diff --git a/xen/include/asm-x86/shadow.h b/xen/include/asm-x86/shadow.h index 670394ce60..89c09d004f 100644 --- a/xen/include/asm-x86/shadow.h +++ b/xen/include/asm-x86/shadow.h @@ -1,3 +1,22 @@ +/****************************************************************************** + * include/asm-x86/shadow.h + * + * Copyright (c) 2005 Michael A Fetterman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ #ifndef _XEN_SHADOW_H #define _XEN_SHADOW_H @@ -8,11 +27,6 @@ #include #include -/* Shadow PT flag bits in shadow_status */ -#define PSH_shadowed (1<<31) /* page has a shadow. PFN points to shadow */ -#define PSH_hl2 (1<<30) /* page is an hl2 */ -#define PSH_pfn_mask ((1<<21)-1) - /* Shadow PT operation mode : shadow-mode variable in arch_domain. */ #define SHM_enable (1<<0) /* we're in one of the shadow modes */ @@ -26,8 +40,13 @@ #define shadow_mode_external(_d) ((_d)->arch.shadow_mode & SHM_external) #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) -#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \ +#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \ (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))) +#define shadow_linear_l2_table(_ed) ((_ed)->arch.shadow_vtable) + +// easy access to the hl2 table (for translated but not external modes only) +#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))) #define shadow_lock_init(_d) spin_lock_init(&(_d)->arch.shadow_lock) #define shadow_lock(_d) spin_lock(&(_d)->arch.shadow_lock) @@ -36,18 +55,86 @@ extern void shadow_mode_init(void); extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc); extern int shadow_fault(unsigned long va, struct xen_regs *regs); -extern void shadow_l1_normal_pt_update( - unsigned long pa, unsigned long gpte, - unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr); -extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde); -extern void unshadow_table(unsigned long gpfn, unsigned int type); extern int shadow_mode_enable(struct domain *p, unsigned int 
mode); -extern void free_shadow_state(struct domain *d); extern void shadow_invlpg(struct exec_domain *, unsigned long); -extern unsigned long mk_hl2_table(struct exec_domain *ed); +extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync( + struct exec_domain *ed, unsigned long gpfn, unsigned long mfn); +extern void free_monitor_pagetable(struct exec_domain *ed); +extern void __shadow_sync_all(struct domain *d); +extern int __shadow_out_of_sync(struct exec_domain *ed, unsigned long va); + +static inline unsigned long __shadow_status( + struct domain *d, unsigned long gpfn, unsigned long stype); extern void vmx_shadow_clear_state(struct domain *); +/************************************************************************/ + +static void inline +__shadow_sync_mfn(struct domain *d, unsigned long mfn) +{ + if ( d->arch.out_of_sync ) + { + // XXX - could be smarter + // + __shadow_sync_all(d); + } +} + +static void inline +__shadow_sync_va(struct exec_domain *ed, unsigned long va) +{ + struct domain *d = ed->domain; + + if ( d->arch.out_of_sync && __shadow_out_of_sync(ed, va) ) + { + // XXX - could be smarter + // + __shadow_sync_all(ed->domain); + } +} + +static void inline +shadow_sync_all(struct domain *d) +{ + if ( unlikely(shadow_mode_enabled(d)) ) + { + shadow_lock(d); + + if ( d->arch.out_of_sync ) + __shadow_sync_all(d); + + ASSERT(d->arch.out_of_sync == NULL); + + shadow_unlock(d); + } +} + +// SMP BUG: This routine can't ever be used properly in an SMP context. +// It should be something like get_shadow_and_sync_va(). +// This probably shouldn't exist. 
+// +static void inline +shadow_sync_va(struct exec_domain *ed, unsigned long gva) +{ + struct domain *d = ed->domain; + if ( unlikely(shadow_mode_enabled(d)) ) + { + shadow_lock(d); + __shadow_sync_va(ed, gva); + shadow_unlock(d); + } +} + +extern void __shadow_mode_disable(struct domain *d); +static inline void shadow_mode_disable(struct domain *d) +{ + if ( shadow_mode_enabled(d) ) + __shadow_mode_disable(d); +} + +/************************************************************************/ + #define __mfn_to_gpfn(_d, mfn) \ ( (shadow_mode_translate(_d)) \ ? machine_to_phys_mapping[(mfn)] \ @@ -58,39 +145,41 @@ extern void vmx_shadow_clear_state(struct domain *); ? phys_to_machine_mapping(gpfn) \ : (gpfn) ) -extern void __shadow_mode_disable(struct domain *d); -static inline void shadow_mode_disable(struct domain *d) -{ - if ( shadow_mode_enabled(d) ) - __shadow_mode_disable(d); -} +/************************************************************************/ -extern unsigned long shadow_l2_table( - struct domain *d, unsigned long gmfn); - -static inline void shadow_invalidate(struct exec_domain *ed) { - if ( !VMX_DOMAIN(ed) ) - BUG(); - memset(ed->arch.shadow_vtable, 0, PAGE_SIZE); -} +struct shadow_status { + unsigned long gpfn_and_flags; /* Guest pfn plus flags. */ + struct shadow_status *next; /* Pull-to-front list. */ + unsigned long smfn; /* Shadow mfn. */ +}; + +#define shadow_ht_extra_size 128 +#define shadow_ht_buckets 256 + +struct out_of_sync_entry { + struct out_of_sync_entry *next; + unsigned long gpfn; /* why is this here? 
*/ + unsigned long gmfn; + unsigned long snapshot_mfn; + unsigned long writable_pl1e; /* NB: this is a machine address */ +}; + +#define out_of_sync_extra_size 127 + +#define SHADOW_SNAPSHOT_ELSEWHERE (-1L) + +/************************************************************************/ #define SHADOW_DEBUG 0 #define SHADOW_VERBOSE_DEBUG 0 +#define SHADOW_VVERBOSE_DEBUG 0 #define SHADOW_HASH_DEBUG 0 +#define FULLSHADOW_DEBUG 0 #if SHADOW_DEBUG extern int shadow_status_noswap; #endif -struct shadow_status { - unsigned long pfn; /* Guest pfn. */ - unsigned long smfn_and_flags; /* Shadow mfn plus flags. */ - struct shadow_status *next; /* Pull-to-front list. */ -}; - -#define shadow_ht_extra_size 128 -#define shadow_ht_buckets 256 - #ifdef VERBOSE #define SH_LOG(_f, _a...) \ printk("DOM%uP%u: SH_LOG(%d): " _f "\n", \ @@ -99,7 +188,7 @@ struct shadow_status { #define SH_LOG(_f, _a...) #endif -#if SHADOW_DEBUG +#if SHADOW_VERBOSE_DEBUG #define SH_VLOG(_f, _a...) \ printk("DOM%uP%u: SH_VLOG(%d): " _f "\n", \ current->domain->id, current->processor, __LINE__ , ## _a ) @@ -107,7 +196,7 @@ struct shadow_status { #define SH_VLOG(_f, _a...) #endif -#if SHADOW_VERBOSE_DEBUG +#if SHADOW_VVERBOSE_DEBUG #define SH_VVLOG(_f, _a...) \ printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n", \ current->domain->id, current->processor, __LINE__ , ## _a ) @@ -115,60 +204,148 @@ struct shadow_status { #define SH_VVLOG(_f, _a...) #endif -// BUG: mafetter: this assumes ed == current, so why pass ed? -static inline void __shadow_get_l2e( - struct exec_domain *ed, unsigned long va, unsigned long *sl2e) +#if FULLSHADOW_DEBUG +#define FSH_LOG(_f, _a...) \ + printk("DOM%uP%u: FSH_LOG(%d): " _f "\n", \ + current->domain->id, current->processor, __LINE__ , ## _a ) +#else +#define FSH_LOG(_f, _a...) 
+#endif + + +/************************************************************************/ + +static inline void +__shadow_get_l2e( + struct exec_domain *ed, unsigned long va, unsigned long *psl2e) { - if ( !likely(shadow_mode_enabled(ed->domain)) ) - BUG(); + ASSERT(shadow_mode_enabled(ed->domain)); - if ( shadow_mode_translate(ed->domain) ) - *sl2e = l2_pgentry_val( - ed->arch.shadow_vtable[l2_table_offset(va)]); - else - *sl2e = l2_pgentry_val( - shadow_linear_l2_table[l2_table_offset(va)]); + *psl2e = l2_pgentry_val( ed->arch.shadow_vtable[l2_table_offset(va)]); } -static inline void __shadow_set_l2e( +static inline void +__shadow_set_l2e( struct exec_domain *ed, unsigned long va, unsigned long value) { - if ( !likely(shadow_mode_enabled(ed->domain)) ) - BUG(); + ASSERT(shadow_mode_enabled(ed->domain)); - if ( shadow_mode_translate(ed->domain) ) - ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value); - else - shadow_linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value); + ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value); } -static inline void __guest_get_l2e( - struct exec_domain *ed, unsigned long va, unsigned long *l2e) +static inline void +__guest_get_l2e( + struct exec_domain *ed, unsigned long va, unsigned long *pl2e) { - *l2e = ( shadow_mode_translate(ed->domain) ) ? - l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]) : - l2_pgentry_val(linear_l2_table[l2_table_offset(va)]); + *pl2e = l2_pgentry_val(ed->arch.guest_vtable[l2_table_offset(va)]); } -static inline void __guest_set_l2e( +static inline void +__guest_set_l2e( struct exec_domain *ed, unsigned long va, unsigned long value) { - if ( shadow_mode_translate(ed->domain) ) + if ( unlikely(shadow_mode_translate(ed->domain)) ) { - unsigned long pfn; + unsigned long mfn = phys_to_machine_mapping(value >> PAGE_SHIFT); + unsigned long old_hl2e = + l1_pgentry_val(ed->arch.hl2_vtable[l2_table_offset(va)]); + unsigned long new_hl2e = + (mfn ? 
((mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR) : 0); - pfn = phys_to_machine_mapping(value >> PAGE_SHIFT); - ed->arch.hl2_vtable[l2_table_offset(va)] = - mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); + // only do the ref counting if something important changed. + // + if ( (old_hl2e ^ new_hl2e) & (PAGE_MASK | _PAGE_PRESENT) ) + { + if ( new_hl2e & _PAGE_PRESENT ) + get_page_from_l1e(mk_l1_pgentry(new_hl2e), ed->domain); + if ( old_hl2e & _PAGE_PRESENT ) + put_page_from_l1e(mk_l1_pgentry(old_hl2e), ed->domain); + } - ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value); + ed->arch.hl2_vtable[l2_table_offset(va)] = mk_l1_pgentry(new_hl2e); } - else + + ed->arch.guest_vtable[l2_table_offset(va)] = mk_l2_pgentry(value); +} + +/************************************************************************/ + +/* + * Add another shadow reference to smfn. + */ +static inline int +get_shadow_ref(unsigned long smfn) +{ + u32 x, nx; + + ASSERT(pfn_is_ram(smfn)); + + x = frame_table[smfn].count_info; + nx = x + 1; + + if ( unlikely(nx == 0) ) { - linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value); + printk("get_shadow_ref overflow, gmfn=%p smfn=%p\n", + frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn); + BUG(); } + + // Guarded by the shadow lock... + // + frame_table[smfn].count_info = nx; + + return 1; } +extern void free_shadow_page(unsigned long smfn); + +/* + * Drop a shadow reference to smfn. + */ +static inline void +put_shadow_ref(unsigned long smfn) +{ + u32 x, nx; + + ASSERT(pfn_is_ram(smfn)); + + x = frame_table[smfn].count_info; + nx = x - 1; + + if ( unlikely(x == 0) ) + { + printk("put_shadow_ref underflow, gmfn=%p smfn=%p\n", + frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn); + BUG(); + } + + // Guarded by the shadow lock... 
+ // + frame_table[smfn].count_info = nx; + + if ( unlikely(nx == 0) ) + { + free_shadow_page(smfn); + } +} + +static inline void +shadow_pin(unsigned long smfn) +{ + ASSERT( !(frame_table[smfn].u.inuse.type_info & PGT_pinned) ); + + frame_table[smfn].u.inuse.type_info |= PGT_pinned; + get_shadow_ref(smfn); +} + +static inline void +shadow_unpin(unsigned long smfn) +{ + frame_table[smfn].u.inuse.type_info &= ~PGT_pinned; + put_shadow_ref(smfn); +} + + /************************************************************************/ static inline int __mark_dirty(struct domain *d, unsigned int mfn) @@ -179,7 +356,7 @@ static inline int __mark_dirty(struct domain *d, unsigned int mfn) ASSERT(spin_is_locked(&d->arch.shadow_lock)); ASSERT(d->arch.shadow_dirty_bitmap != NULL); - pfn = machine_to_phys_mapping[mfn]; + pfn = __mfn_to_gpfn(d, mfn); /* * Values with the MSB set denote MFNs that aren't really part of the @@ -226,23 +403,41 @@ static inline int mark_dirty(struct domain *d, unsigned int mfn) /************************************************************************/ +extern void shadow_mark_out_of_sync( + struct exec_domain *ed, unsigned long gpfn, unsigned long mfn, + unsigned long va); + static inline void l1pte_write_fault( - struct domain *d, unsigned long *gpte_p, unsigned long *spte_p) -{ + struct exec_domain *ed, unsigned long *gpte_p, unsigned long *spte_p, + unsigned long va) +{ + struct domain *d = ed->domain; unsigned long gpte = *gpte_p; - unsigned long spte = *spte_p; - unsigned long pfn = gpte >> PAGE_SHIFT; - unsigned long mfn = __gpfn_to_mfn(d, pfn); + unsigned long spte; + unsigned long gpfn = gpte >> PAGE_SHIFT; + unsigned long mfn = __gpfn_to_mfn(d, gpfn); + + //printk("l1pte_write_fault gmfn=%p\n", mfn); + + if ( unlikely(!mfn) ) + { + SH_LOG("l1pte_write_fault: invalid gpfn=%p", gpfn); + *spte_p = 0; + return; + } ASSERT(gpte & _PAGE_RW); gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; + spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); + + 
SH_VVLOG("l1pte_write_fault: updating spte=0x%p gpte=0x%p", spte, gpte); if ( shadow_mode_log_dirty(d) ) - __mark_dirty(d, pfn); + __mark_dirty(d, mfn); - spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); + if ( mfn_is_page_table(mfn) ) + shadow_mark_out_of_sync(ed, gpfn, mfn, va); - SH_VVLOG("l1pte_write_fault: updating spte=0x%p gpte=0x%p", spte, gpte); *gpte_p = gpte; *spte_p = spte; } @@ -255,11 +450,21 @@ static inline void l1pte_read_fault( unsigned long pfn = gpte >> PAGE_SHIFT; unsigned long mfn = __gpfn_to_mfn(d, pfn); + if ( unlikely(!mfn) ) + { + SH_LOG("l1pte_read_fault: invalid gpfn=%p", pfn); + *spte_p = 0; + return; + } + gpte |= _PAGE_ACCESSED; spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); - if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) ) + if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) || + mfn_is_page_table(mfn) ) + { spte &= ~_PAGE_RW; + } SH_VVLOG("l1pte_read_fault: updating spte=0x%p gpte=0x%p", spte, gpte); *gpte_p = gpte; @@ -267,9 +472,8 @@ static inline void l1pte_read_fault( } static inline void l1pte_propagate_from_guest( - struct domain *d, unsigned long *gpte_p, unsigned long *spte_p) + struct domain *d, unsigned long gpte, unsigned long *spte_p) { - unsigned long gpte = *gpte_p; unsigned long spte = *spte_p; unsigned long pfn = gpte >> PAGE_SHIFT; unsigned long mfn = __gpfn_to_mfn(d, pfn); @@ -278,33 +482,36 @@ static inline void l1pte_propagate_from_guest( unsigned long old_spte = spte; #endif - /* Use 1:1 page table to identify MMIO address space */ - if ( shadow_mode_external(d) && mmio_space(gpte) ) { + if ( unlikely(!mfn) ) + { + // likely an MMIO address space mapping... 
+ // *spte_p = 0; return; } - + spte = 0; if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == (_PAGE_PRESENT|_PAGE_ACCESSED) ) { spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK); - if ( shadow_mode_log_dirty(d) || !(gpte & _PAGE_DIRTY) ) + if ( shadow_mode_log_dirty(d) || + !(gpte & _PAGE_DIRTY) || + mfn_is_page_table(mfn) ) + { spte &= ~_PAGE_RW; + } } - + #if SHADOW_VERBOSE_DEBUG if ( old_spte || spte || gpte ) - SH_VVLOG("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p ", gpte, old_spte, spte); + debugtrace_printk("l1pte_propagate_from_guest: gpte=0x%p, old spte=0x%p, new spte=0x%p\n", gpte, old_spte, spte); #endif - *gpte_p = gpte; *spte_p = spte; } - - static inline void l2pde_general( struct domain *d, unsigned long *gpde_p, @@ -312,33 +519,104 @@ static inline void l2pde_general( unsigned long sl1mfn) { unsigned long gpde = *gpde_p; - unsigned long spde = *spde_p; + unsigned long spde; spde = 0; - - if ( sl1mfn != 0 ) + if ( (gpde & _PAGE_PRESENT) && (sl1mfn != 0) ) { spde = (gpde & ~PAGE_MASK) | (sl1mfn << PAGE_SHIFT) | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY; gpde |= _PAGE_ACCESSED; /* N.B. PDEs do not have a dirty bit. */ - /* Detect linear p.t. mappings and write-protect them. */ - if ( (frame_table[sl1mfn].u.inuse.type_info & PGT_type_mask) == - PGT_l2_page_table ) - { - if ( !shadow_mode_translate(d) ) - spde = gpde & ~_PAGE_RW; - - } + // XXX mafetter: Hmm... + // Shouldn't the dirty log be checked/updated here? + // Actually, it needs to be done in this function's callers. 
+ // + *gpde_p = gpde; } - *gpde_p = gpde; *spde_p = spde; } +static inline void l2pde_propagate_from_guest( + struct domain *d, unsigned long *gpde_p, unsigned long *spde_p) +{ + unsigned long gpde = *gpde_p, sl1mfn; + + sl1mfn = __shadow_status(d, gpde >> PAGE_SHIFT, PGT_l1_shadow); + l2pde_general(d, gpde_p, spde_p, sl1mfn); +} + +/************************************************************************/ + +// returns true if a tlb flush is needed +// +static int inline +validate_pte_change( + struct domain *d, + unsigned long new_pte, + unsigned long *shadow_pte_p) +{ + unsigned long old_spte, new_spte; + + perfc_incrc(validate_pte_change); + +#if 0 + FSH_LOG("validate_pte(old=%p new=%p)\n", old_pte, new_pte); +#endif + + old_spte = *shadow_pte_p; + l1pte_propagate_from_guest(d, new_pte, shadow_pte_p); + new_spte = *shadow_pte_p; + + // only do the ref counting if something important changed. + // + if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) ) + { + if ( new_spte & _PAGE_PRESENT ) + get_page_from_l1e(mk_l1_pgentry(new_spte), d); + if ( old_spte & _PAGE_PRESENT ) + put_page_from_l1e(mk_l1_pgentry(old_spte), d); + } + + // paranoia rules! + return 1; +} + +// returns true if a tlb flush is needed +// +static int inline +validate_pde_change( + struct domain *d, + unsigned long new_pde, + unsigned long *shadow_pde_p) +{ + unsigned long old_spde = *shadow_pde_p; + unsigned long new_spde; + + perfc_incrc(validate_pde_change); + + l2pde_propagate_from_guest(d, &new_pde, shadow_pde_p); + new_spde = *shadow_pde_p; + + // only do the ref counting if something important changed. + // + if ( (old_spde ^ new_spde) & (PAGE_MASK | _PAGE_PRESENT) ) + { + if ( new_spde & _PAGE_PRESENT ) + get_shadow_ref(new_spde >> PAGE_SHIFT); + if ( old_spde & _PAGE_PRESENT ) + put_shadow_ref(old_spde >> PAGE_SHIFT); + } + + // paranoia rules! 
+ return 1; +} + /*********************************************************************/ #if SHADOW_HASH_DEBUG + static void shadow_audit(struct domain *d, int print) { int live = 0, free = 0, j = 0, abs; @@ -347,26 +625,25 @@ static void shadow_audit(struct domain *d, int print) for ( j = 0; j < shadow_ht_buckets; j++ ) { a = &d->arch.shadow_ht[j]; - if ( a->pfn ) + if ( a->gpfn_and_flags ) { live++; - ASSERT(a->smfn_and_flags & PSH_pfn_mask); + ASSERT(a->smfn); } else ASSERT(!a->next); - ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL); + a = a->next; while ( a && (live < 9999) ) { live++; - if ( (a->pfn == 0) || (a->smfn_and_flags == 0) ) + if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) ) { - printk("XXX live=%d pfn=%p sp=%p next=%p\n", - live, a->pfn, a->smfn_and_flags, a->next); + printk("XXX live=%d gpfn+flags=%p sp=%p next=%p\n", + live, a->gpfn_and_flags, a->smfn, a->next); BUG(); } - ASSERT( (a->pfn & ~PSH_hl2) < 0x00100000UL); - ASSERT(a->smfn_and_flags & PSH_pfn_mask); + ASSERT(a->smfn); a = a->next; } ASSERT(live < 9999); @@ -376,21 +653,26 @@ static void shadow_audit(struct domain *d, int print) free++; if ( print ) - printk("Xlive=%d free=%d\n",live,free); + printk("Xlive=%d free=%d\n", live, free); // BUG: this only works if there's only a single domain which is // using shadow tables. 
// - abs = ( perfc_value(shadow_l1_pages) + - perfc_value(shadow_l2_pages) + - perfc_value(hl2_table_pages) ) - live; + abs = ( + perfc_value(shadow_l1_pages) + + perfc_value(shadow_l2_pages) + + perfc_value(hl2_table_pages) + + perfc_value(snapshot_pages) + ) - live; #ifdef PERF_COUNTERS if ( (abs < -1) || (abs > 1) ) { - printk("live=%d free=%d l1=%d l2=%d hl2=%d\n", live, free, + printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d\n", + live, free, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages), - perfc_value(hl2_table_pages)); + perfc_value(hl2_table_pages), + perfc_value(snapshot_pages)); BUG(); } #endif @@ -411,30 +693,36 @@ static inline struct shadow_status *hash_bucket( * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace, * which, depending on full shadow mode, may or may not equal * its mfn). - * The shadow status it returns is a mfn. + * It returns the shadow's mfn, or zero if it doesn't exist. */ + static inline unsigned long __shadow_status( - struct domain *d, unsigned int gpfn) + struct domain *d, unsigned long gpfn, unsigned long stype) { struct shadow_status *p, *x, *head; + unsigned long key = gpfn | stype; ASSERT(spin_is_locked(&d->arch.shadow_lock)); + ASSERT(gpfn == (gpfn & PGT_mfn_mask)); + ASSERT(stype && !(stype & ~PGT_type_mask)); + + perfc_incrc(shadow_status_calls); x = head = hash_bucket(d, gpfn); p = NULL; - //SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x); + //SH_VVLOG("lookup gpfn=%08x type=%08x bucket=%p", gpfn, stype, x); shadow_audit(d, 0); do { - ASSERT(x->pfn || ((x == head) && (x->next == NULL))); + ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL))); - if ( x->pfn == gpfn ) + if ( x->gpfn_and_flags == key ) { #if SHADOW_DEBUG if ( unlikely(shadow_status_noswap) ) - return x->smfn_and_flags; + return x->smfn; #endif /* Pull-to-front if 'x' isn't already the head item. 
*/ if ( unlikely(x != head) ) @@ -445,13 +733,16 @@ static inline unsigned long __shadow_status( head->next = x; /* Swap 'x' contents with head contents. */ - SWAP(head->pfn, x->pfn); - SWAP(head->smfn_and_flags, x->smfn_and_flags); + SWAP(head->gpfn_and_flags, x->gpfn_and_flags); + SWAP(head->smfn, x->smfn); + } + else + { + perfc_incrc(shadow_status_hit_head); } - SH_VVLOG("lookup gpfn=%p => status=%p", - gpfn, head->smfn_and_flags); - return head->smfn_and_flags; + SH_VVLOG("lookup gpfn=%p => status=%p", key, head->smfn); + return head->smfn; } p = x; @@ -459,17 +750,68 @@ static inline unsigned long __shadow_status( } while ( x != NULL ); - SH_VVLOG("lookup gpfn=%p => status=0", gpfn); + SH_VVLOG("lookup gpfn=%p => status=0", key); + perfc_incrc(shadow_status_miss); return 0; } +/* + * Not clear if pull-to-front is worth while for this or not, + * as it generally needs to scan the entire bucket anyway. + * Much simpler without. + * + * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table. + */ +static inline unsigned long +shadow_max_pgtable_type(struct domain *d, unsigned long gpfn) +{ + struct shadow_status *x; + unsigned long pttype = PGT_none, type; + + ASSERT(spin_is_locked(&d->arch.shadow_lock)); + ASSERT(gpfn == (gpfn & PGT_mfn_mask)); + + x = hash_bucket(d, gpfn); + + while ( x && x->gpfn_and_flags ) + { + if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn ) + { + type = x->gpfn_and_flags & PGT_type_mask; + + // Treat an HL2 as if it's an L1 + // + if ( type == PGT_hl2_shadow ) + type = PGT_l1_shadow; + + // Ignore snapshots -- they don't in and of themselves constitute + // treating a page as a page table + // + if ( type == PGT_snapshot ) + goto next; + + // Early exit if we found the max possible value + // + if ( type == PGT_base_page_table ) + return type; + + if ( type > pttype ) + pttype = type; + } + next: + x = x->next; + } + + return pttype; +} + /* * N.B. 
We can make this locking more fine grained (e.g., per shadow page) if * it ever becomes a problem, but since we need a spin lock on the hash table * anyway it's probably not worth being too clever. */ static inline unsigned long get_shadow_status( - struct domain *d, unsigned int gpfn ) + struct domain *d, unsigned long gpfn, unsigned long stype) { unsigned long res; @@ -481,65 +823,66 @@ static inline unsigned long get_shadow_status( * has changed type. If we're in log dirty mode, we should set the * appropriate bit in the dirty bitmap. * N.B. The VA update path doesn't use this and is handled independently. - - XXX need to think this through for vmx guests, but probably OK + * + * XXX need to think this through for vmx guests, but probably OK */ shadow_lock(d); if ( shadow_mode_log_dirty(d) ) - __mark_dirty(d, gpfn); + __mark_dirty(d, __gpfn_to_mfn(d, gpfn)); - if ( !(res = __shadow_status(d, gpfn)) ) + if ( !(res = __shadow_status(d, gpfn, stype)) ) shadow_unlock(d); return res; } -static inline void put_shadow_status( - struct domain *d) +static inline void put_shadow_status(struct domain *d) { shadow_unlock(d); } static inline void delete_shadow_status( - struct domain *d, unsigned int gpfn) + struct domain *d, unsigned int gpfn, unsigned int stype) { struct shadow_status *p, *x, *n, *head; + unsigned long key = gpfn | stype; ASSERT(spin_is_locked(&d->arch.shadow_lock)); - ASSERT(gpfn != 0); + ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask)); + ASSERT(stype && !(stype & ~PGT_type_mask)); head = hash_bucket(d, gpfn); - SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, head); + SH_VLOG("delete gpfn=%p t=%p bucket=%p", gpfn, stype, head); shadow_audit(d, 0); /* Match on head item? */ - if ( head->pfn == gpfn ) + if ( head->gpfn_and_flags == key ) { if ( (n = head->next) != NULL ) { /* Overwrite head with contents of following node. 
*/ - head->pfn = n->pfn; - head->smfn_and_flags = n->smfn_and_flags; + head->gpfn_and_flags = n->gpfn_and_flags; + head->smfn = n->smfn; /* Delete following node. */ head->next = n->next; /* Add deleted node to the free list. */ - n->pfn = 0; - n->smfn_and_flags = 0; + n->gpfn_and_flags = 0; + n->smfn = 0; n->next = d->arch.shadow_ht_free; d->arch.shadow_ht_free = n; } else { /* This bucket is now empty. Initialise the head node. */ - head->pfn = 0; - head->smfn_and_flags = 0; + head->gpfn_and_flags = 0; + head->smfn = 0; } goto found; @@ -550,14 +893,14 @@ static inline void delete_shadow_status( do { - if ( x->pfn == gpfn ) + if ( x->gpfn_and_flags == key ) { /* Delete matching node. */ p->next = x->next; /* Add deleted node to the free list. */ - x->pfn = 0; - x->smfn_and_flags = 0; + x->gpfn_and_flags = 0; + x->smfn = 0; x->next = d->arch.shadow_ht_free; d->arch.shadow_ht_free = x; @@ -573,34 +916,46 @@ static inline void delete_shadow_status( BUG(); found: + // release ref to page + put_page(pfn_to_page(__gpfn_to_mfn(d, gpfn))); + shadow_audit(d, 0); } - static inline void set_shadow_status( - struct domain *d, unsigned int gpfn, unsigned long s) + struct domain *d, unsigned long gpfn, + unsigned long smfn, unsigned long stype) { struct shadow_status *x, *head, *extra; int i; + unsigned long gmfn = __gpfn_to_mfn(d, gpfn); + unsigned long key = gpfn | stype; ASSERT(spin_is_locked(&d->arch.shadow_lock)); - ASSERT(gpfn != 0); - ASSERT(s & (PSH_shadowed | PSH_hl2)); + ASSERT(gpfn && !(gpfn & ~PGT_mfn_mask)); + ASSERT(pfn_is_ram(gmfn)); // XXX need to be more graceful + ASSERT(smfn && !(smfn & ~PGT_mfn_mask)); + ASSERT(stype && !(stype & ~PGT_type_mask)); x = head = hash_bucket(d, gpfn); - SH_VVLOG("set gpfn=%08x s=%p bucket=%p(%p)", gpfn, s, x, x->next); + SH_VLOG("set gpfn=%p smfn=%p t=%p bucket=%p(%p)", + gpfn, smfn, stype, x, x->next); shadow_audit(d, 0); + // grab a reference to the guest page to represent the entry in the shadow + // hash table + // + 
get_page(pfn_to_page(gmfn), d); + /* * STEP 1. If page is already in the table, update it in place. */ - do { - if ( x->pfn == gpfn ) + if ( x->gpfn_and_flags == key ) { - x->smfn_and_flags = s; + x->smfn = smfn; goto done; } @@ -613,10 +968,10 @@ static inline void set_shadow_status( */ /* If the bucket is empty then insert the new page as the head item. */ - if ( head->pfn == 0 ) + if ( head->gpfn_and_flags == 0 ) { - head->pfn = gpfn; - head->smfn_and_flags = s; + head->gpfn_and_flags = key; + head->smfn = smfn; ASSERT(head->next == NULL); goto done; } @@ -655,35 +1010,107 @@ static inline void set_shadow_status( d->arch.shadow_ht_free = x->next; /* Initialise the new node and insert directly after the head item. */ - x->pfn = gpfn; - x->smfn_and_flags = s; + x->gpfn_and_flags = key; + x->smfn = smfn; x->next = head->next; head->next = x; done: shadow_audit(d, 0); } - + +/************************************************************************/ + +extern void shadow_map_l1_into_current_l2(unsigned long va); + +void static inline +shadow_set_l1e(unsigned long va, unsigned long new_spte, int create_l1_shadow) +{ + struct exec_domain *ed = current; + struct domain *d = ed->domain; + unsigned long sl2e, old_spte; + +#if 0 + printk("shadow_set_l1e(va=%p, new_spte=%p, create=%d)\n", + va, new_spte, create_l1_shadow); +#endif + + __shadow_get_l2e(ed, va, &sl2e); + if ( !(sl2e & _PAGE_PRESENT) ) + { + /* + * Either the L1 is not shadowed, or the shadow isn't linked into + * the current shadow L2. 
+ */ + if ( create_l1_shadow ) + { + perfc_incrc(shadow_set_l1e_force_map); + shadow_map_l1_into_current_l2(va); + } + else /* check to see if it exists; if so, link it in */ + { + unsigned long gpde = + l2_pgentry_val(linear_l2_table(ed)[l2_table_offset(va)]); + unsigned long gl1pfn = gpde >> PAGE_SHIFT; + unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow); + + ASSERT( gpde & _PAGE_PRESENT ); + + if ( sl1mfn ) + { + perfc_incrc(shadow_set_l1e_unlinked); + get_shadow_ref(sl1mfn); + l2pde_general(d, &gpde, &sl2e, sl1mfn); + __guest_set_l2e(ed, va, gpde); + __shadow_set_l2e(ed, va, sl2e); + } + else + { + // no shadow exists, so there's nothing to do. + perfc_incrc(shadow_set_l1e_fail); + return; + } + } + } + + old_spte = l1_pgentry_val(shadow_linear_pg_table[l1_linear_offset(va)]); + shadow_linear_pg_table[l1_linear_offset(va)] = mk_l1_pgentry(new_spte); + + // only do the ref counting if something important changed. + // + if ( (old_spte ^ new_spte) & (PAGE_MASK | _PAGE_RW | _PAGE_PRESENT) ) + { + if ( new_spte & _PAGE_PRESENT ) + get_page_from_l1e(mk_l1_pgentry(new_spte), d); + if ( old_spte & _PAGE_PRESENT ) + put_page_from_l1e(mk_l1_pgentry(old_spte), d); + } +} + +/************************************************************************/ + static inline unsigned long gva_to_gpte(unsigned long gva) { - unsigned long gpde, gpte, pfn, index; + unsigned long gpde, gpte; struct exec_domain *ed = current; + ASSERT( shadow_mode_translate(current->domain) ); + __guest_get_l2e(ed, gva, &gpde); - if (!(gpde & _PAGE_PRESENT)) + if ( unlikely(!(gpde & _PAGE_PRESENT)) ) return 0; - index = l2_table_offset(gva); - - if (!l2_pgentry_val(ed->arch.hl2_vtable[index])) { - pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT); - ed->arch.hl2_vtable[index] = - mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - } + // This is actually overkill - we only need to make sure the hl2 + // is in-sync. 
+ // + shadow_sync_va(ed, gva); if ( unlikely(__get_user(gpte, (unsigned long *) &linear_pg_table[gva >> PAGE_SHIFT])) ) + { + FSH_LOG("gva_to_gpte got a fault on gva=%p\n", gva); return 0; + } return gpte; } @@ -699,94 +1126,19 @@ static inline unsigned long gva_to_gpa(unsigned long gva) return (gpte & PAGE_MASK) + (gva & ~PAGE_MASK); } -static inline void hl2_table_invalidate(struct exec_domain *ed) -{ - /* - * Need to optimize this - */ - memset(ed->arch.hl2_vtable, 0, PAGE_SIZE); -} - -static inline void __update_pagetables(struct exec_domain *ed) -{ - struct domain *d = ed->domain; - unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT; - unsigned long gpfn = __mfn_to_gpfn(d, gmfn); - unsigned long smfn = __shadow_status(d, gpfn) & PSH_pfn_mask; - - SH_VVLOG("0: __update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn); - - if ( unlikely(smfn == 0) ) - smfn = shadow_l2_table(d, gmfn); - - ed->arch.shadow_table = mk_pagetable(smfn<arch.monitor_vtable; - l2_pgentry_t *gpl2e, *spl2e; - unsigned long hl2_status, hl2mfn, offset; - int need_flush = 0; - - if ( ed->arch.guest_vtable ) - unmap_domain_mem(ed->arch.guest_vtable); - if ( ed->arch.shadow_vtable ) - unmap_domain_mem(ed->arch.shadow_vtable); - if ( ed->arch.hl2_vtable ) - unmap_domain_mem(ed->arch.hl2_vtable); - - gpl2e = ed->arch.guest_vtable = - map_domain_mem(pagetable_val(ed->arch.guest_table)); - spl2e = ed->arch.shadow_vtable = - map_domain_mem(pagetable_val(ed->arch.shadow_table)); - - hl2_status = __shadow_status(d, gpfn | PSH_hl2); - if ( unlikely(!(hl2_status & PSH_hl2)) ) - hl2_status = mk_hl2_table(ed); - - hl2mfn = hl2_status & PSH_pfn_mask; - ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT); - - offset = l2_table_offset(LINEAR_PT_VIRT_START); - if ( hl2mfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) ) - { - mpl2e[offset] = - mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - need_flush = 1; - } - - if ( shadow_mode_external(d ) ) - { - offset = 
l2_table_offset(SH_LINEAR_PT_VIRT_START); - if ( smfn != (l2_pgentry_val(mpl2e[offset]) >> PAGE_SHIFT) ) - { - mpl2e[offset] = - mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR); - need_flush = 1; - } - } - - if ( VMX_DOMAIN(ed) ) - { - // Why is VMX mode doing this? - shadow_invalidate(ed); - hl2_table_invalidate(ed); - } - - if ( need_flush ) - local_flush_tlb(); - } -} +/************************************************************************/ +extern void __update_pagetables(struct exec_domain *ed); static inline void update_pagetables(struct exec_domain *ed) { struct domain *d = ed->domain; - int paging_enabled = + #ifdef CONFIG_VMX + int paging_enabled = !VMX_DOMAIN(ed) || test_bit(VMX_CPU_STATE_PG_ENABLED, &ed->arch.arch_vmx.cpu_state); #else - 1; + const int paging_enabled = 1; #endif /* @@ -802,7 +1154,7 @@ static inline void update_pagetables(struct exec_domain *ed) shadow_unlock(d); } - if ( !shadow_mode_external(d) ) + if ( likely(!shadow_mode_external(d)) ) { #ifdef __x86_64__ if ( !(ed->arch.flags & TF_kernel_mode) ) @@ -814,26 +1166,17 @@ static inline void update_pagetables(struct exec_domain *ed) else ed->arch.monitor_table = ed->arch.guest_table; } - else - { - // External page tables... - // Allocate a monitor page table if we don't already have one. 
- // - if ( unlikely(!pagetable_val(ed->arch.monitor_table)) ) - ed->arch.monitor_table = - mk_pagetable(alloc_monitor_pagetable(ed) << PAGE_SHIFT); - } } #if SHADOW_DEBUG -extern int _check_pagetable(struct domain *d, pagetable_t pt, char *s); -extern int _check_all_pagetables(struct domain *d, char *s); +extern int _check_pagetable(struct exec_domain *ed, char *s); +extern int _check_all_pagetables(struct exec_domain *ed, char *s); -#define check_pagetable(_d, _pt, _s) _check_pagetable(_d, _pt, _s) -//#define check_pagetable(_d, _pt, _s) _check_all_pagetables(_d, _s) +#define check_pagetable(_ed, _s) _check_pagetable(_ed, _s) +//#define check_pagetable(_ed, _s) _check_all_pagetables(_ed, _s) #else -#define check_pagetable(_d, _pt, _s) ((void)0) +#define check_pagetable(_ed, _s) ((void)0) #endif #endif /* XEN_SHADOW_H */ diff --git a/xen/include/asm-x86/x86_32/page.h b/xen/include/asm-x86/x86_32/page.h index b75df5ca28..897ac2d0bd 100644 --- a/xen/include/asm-x86/x86_32/page.h +++ b/xen/include/asm-x86/x86_32/page.h @@ -68,7 +68,7 @@ typedef l2_pgentry_t root_pgentry_t; #define L1_DISALLOW_MASK (3UL << 7) #define L2_DISALLOW_MASK (7UL << 7) #define L3_DISALLOW_MASK (7UL << 7) -#define L2_DISALLOW_MASK (7UL << 7) +#define L4_DISALLOW_MASK (7UL << 7) #endif /* __X86_32_PAGE_H__ */ diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h index 8db16e2512..15db59d73d 100644 --- a/xen/include/xen/domain.h +++ b/xen/include/xen/domain.h @@ -27,6 +27,4 @@ extern void domain_relinquish_memory(struct domain *d); extern void dump_pageframe_info(struct domain *d); -extern unsigned long alloc_monitor_pagetable(struct exec_domain *ed); - #endif /* __XEN_DOMAIN_H__ */ diff --git a/xen/include/xen/perfc_defn.h b/xen/include/xen/perfc_defn.h index a252af7ac7..895c50e09a 100644 --- a/xen/include/xen/perfc_defn.h +++ b/xen/include/xen/perfc_defn.h @@ -1,3 +1,7 @@ +#define VMX_PERF_EXIT_REASON_SIZE 37 +#define VMX_PERF_VECTOR_SIZE 0x20 +PERFCOUNTER_ARRAY(vmexits, "vmexits", 
VMX_PERF_EXIT_REASON_SIZE ) +PERFCOUNTER_ARRAY(cause_vector, "cause vector", VMX_PERF_VECTOR_SIZE ) PERFCOUNTER_CPU (seg_fixups, "segmentation fixups" ) @@ -17,26 +21,42 @@ PERFCOUNTER_CPU( need_flush_tlb_flush, "PG_need_flush tlb flushes" ) PERFCOUNTER_CPU( calls_to_mmu_update, "calls_to_mmu_update" ) PERFCOUNTER_CPU( num_page_updates, "num_page_updates" ) PERFCOUNTER_CPU( calls_to_update_va, "calls_to_update_va_map" ) -PERFCOUNTER_CPU( page_faults, "page faults" ) -PERFCOUNTER_CPU( copy_user_faults, "copy_user faults" ) PERFCOUNTER_CPU( map_domain_mem_count, "map_domain_mem count" ) -PERFCOUNTER_CPU( shadow_l2_table_count, "shadow_l2_table count" ) -PERFCOUNTER_CPU( shadow_l1_table_count, "shadow_l1_table count" ) -PERFCOUNTER_CPU( unshadow_table_count, "unshadow_table count" ) -PERFCOUNTER_CPU( shadow_fixup_count, "shadow_fixup count" ) -PERFCOUNTER_CPU( shadow_update_va_fail1, "shadow_update_va_fail1" ) -PERFCOUNTER_CPU( shadow_update_va_fail2, "shadow_update_va_fail2" ) +PERFCOUNTER_CPU( shadow_l2_table_count, "shadow_l2_table count" ) +PERFCOUNTER_CPU( shadow_l1_table_count, "shadow_l1_table count" ) +PERFCOUNTER_CPU( shadow_hl2_table_count, "shadow_hl2_table count" ) +PERFCOUNTER_CPU( shadow_set_l1e_force_map, "shadow_set_l1e forced to map l1" ) +PERFCOUNTER_CPU( shadow_set_l1e_unlinked, "shadow_set_l1e found unlinked l1" ) +PERFCOUNTER_CPU( shadow_set_l1e_fail, "shadow_set_l1e failed (no sl1)" ) +PERFCOUNTER_CPU( shadow_invlpg_faults, "shadow_invlpg's get_user faulted") + /* STATUS counters do not reset when 'P' is hit */ PERFSTATUS( shadow_l2_pages, "current # shadow L2 pages" ) PERFSTATUS( shadow_l1_pages, "current # shadow L1 pages" ) PERFSTATUS( hl2_table_pages, "current # hl2 pages" ) - -PERFCOUNTER_CPU( check_pagetable, "calls to check_pagetable" ) -PERFCOUNTER_CPU( check_all_pagetables, "calls to check_all_pagetables" ) - -#define VMX_PERF_EXIT_REASON_SIZE 37 -#define VMX_PERF_VECTOR_SIZE 0x20 -PERFCOUNTER_ARRAY(vmexits, "vmexits", 
VMX_PERF_EXIT_REASON_SIZE ) -PERFCOUNTER_ARRAY(cause_vector, "cause vector", VMX_PERF_VECTOR_SIZE ) +PERFSTATUS( snapshot_pages, "current # fshadow snapshot pages" ) + +PERFCOUNTER_CPU(shadow_status_calls, "calls to __shadow_status" ) +PERFCOUNTER_CPU(shadow_status_miss, "missed shadow cache" ) +PERFCOUNTER_CPU(shadow_status_hit_head, "hits on head of bucket" ) +PERFCOUNTER_CPU(check_pagetable, "calls to check_pagetable" ) +PERFCOUNTER_CPU(check_all_pagetables, "calls to check_all_pagetables" ) + +PERFCOUNTER_CPU(shadow_sync_all, "calls to shadow_sync_all") +PERFCOUNTER_CPU(shadow_make_snapshot, "snapshots created") +PERFCOUNTER_CPU(shadow_mark_mfn_out_of_sync_calls, "calls to shadow_mk_out_of_sync") +PERFCOUNTER_CPU(shadow_out_of_sync_calls, "calls to shadow_out_of_sync") +PERFCOUNTER_CPU(snapshot_entry_matches_calls, "calls to ss_entry_matches") +PERFCOUNTER_CPU(snapshot_entry_matches_true, "ss_entry_matches returns true") + +PERFCOUNTER_CPU(page_faults, "page faults" ) +PERFCOUNTER_CPU(copy_user_faults, "copy_user faults" ) +PERFCOUNTER_CPU(shadow_fault_calls, "calls to shadow_fault") +PERFCOUNTER_CPU(shadow_fault_bail_pde_not_present, "sf bailed due to pde not present") +PERFCOUNTER_CPU(shadow_fault_bail_pte_not_present, "sf bailed due to pte not present") +PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping, "sf bailed due to a ro mapping") +PERFCOUNTER_CPU(shadow_fault_fixed, "sf fixed the pgfault") +PERFCOUNTER_CPU(validate_pte_change, "calls to validate_pte_change") +PERFCOUNTER_CPU(validate_pde_change, "calls to validate_pde_change") -- 2.30.2